Setup Code

Load Libraries

library(rdhs) # for importing data from the API
library(tidyverse) # for data manipulation
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(haven) # for handling labelled columns
library(sf) # for handling shapefiles
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(gstat) # for mapping and krigging
library(sp) # for coordinate transformations

library(spdep) # for getting area data weights
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(geojsonsf) # for converting shapefile into sp
library(ggvoronoi) # for tiling point data into area data
library(spatialreg) # for spatial regression
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'spatialreg'
## 
## The following objects are masked from 'package:spdep':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption
library(GWmodel) # for geographically weighted regression 
## Loading required package: maptools
## Checking rgeos availability: TRUE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
## Loading required package: robustbase
## Loading required package: Rcpp
## Welcome to GWmodel version 2.2-9.
library(caret) # decision tree with cross validation
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart) # decision tree without cross validation
library(rpart.plot) # for plotting dtree results
library(Metrics) # for testing models
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall

Manual Working Directory Setup

  1. Put this file in a folder called “code”. This is your working directory (wd).
  2. Ensure you have the folder in the wd called “exports”
  3. Ensure you have a folder in the wd called “data”
  4. Ensure you have the folder “SLHR7ADT” inside of data, unzipped
  5. Ensure you have the folder “SLGE7AFL” inside of data, unzipped
  6. Ensure you have the folders with the regional and national boundaries as written below
  7. Run this code!

Import Data

# manually read in the stata file
sl19 <- read_dta('./data/SLHR7ADT/SLHR7AFL.DTA')
var_labels <- get_variable_labels(sl19)

# read in shapefiles of the country, regions, and survey clusters
# https://gis.stackexchange.com/questions/19064/opening-shapefile-in-r

# rad in the shapefile of the country
country <- read_sf(dsn = "./data/sdr_national_data_2023-04-17/shps", layer = "sdr_national_data_dhs_2019")
plot(country)
## Warning: plotting the first 9 out of 37 attributes; use max.plot = 37 to plot
## all

# read in the shapefile of regions
regions <- read_sf(dsn = "./data/sdr_subnational_boundaries_2023-04-17/shps", layer = "sdr_subnational_boundaries2")
plot(regions)
## Warning: plotting the first 9 out of 27 attributes; use max.plot = 27 to plot
## all

# read in shapefile of survey cluster coordinates
clusters <- read_sf(dsn = "./data/SLGE7AFL", layer = "SLGE7AFL")
plot(clusters)
## Warning: plotting the first 9 out of 20 attributes; use max.plot = 20 to plot
## all

# "hv001" in sl19 dataframe tells us which cluster number each household belongs to
# "DHSCLUST" in gps dataframe contains the cluster number
# "LATNUM" and "LONGNUM" contain the coordinates of each cluster
# we'll need to join these two dfs to create maps of the variables we're interested in
# https://sparkbyexamples.com/r-programming/how-to-do-left-join-in-r/
sl19gps <- merge(
      x=sl19, 
      y=clusters, 
      by.x="hv001", 
      by.y="DHSCLUST",
      all.x=TRUE)

# export useful data to exports folder
saveRDS(country, "./exports/country.rds")
saveRDS(regions, "./exports/regions.rds")
saveRDS(clusters, "./exports/clusters.rds")
saveRDS(var_labels, "./exports/var_labels.rds")
saveRDS(sl19, "./exports/sl19.rds")
saveRDS(sl19gps, "./exports/sl19gps.rds")

Clean and re-export

sl19gps <- readRDS("./exports/sl19gps.rds")
df <- sl19gps

# columns of interest
interestingColumns <- c("hv000","hv001","hv006","hv007","hv010","hv011","hv012","hv013","hv014","hv024","hv025","hv040","hv045c","hv201","hv204","hv205","hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv213","hv214","hv215","hv216","hv217","hv219","hv220","hv221","hv226","hv227","hv230a","hv237","hv241","hv243a","hv243b","hv243c","hv243d","hv243e","hv244","hv245","hv246","hv246a","hv246b","hv246c","hv246d","hv246e","hv246f","hv247","hv270","hv271","hv270a","hv271a","hml1")
gpsColumns <- c("LATNUM", "LONGNUM", "geometry", "ALT_DEM", "DATUM", "DHSREGNA", "ADM1NAME", "DHSID")
keepColumns <- c(interestingColumns, gpsColumns)

# filter down columns
df <- df[,keepColumns]

# filter out 0 coordinates
df <- filter(df, LATNUM != 0)

# re-export
sl19clean <- df
saveRDS(sl19clean, "./exports/sl19clean.rds")

Create Maps of Variables

# Dropping all data with coordinates (0,0) 
plottable_df <- filter(sl19gps, LATNUM > 0)

# This is turning hv206 from"haven-labelled' class to numeric for compatability with geom_point function.
plottable_df$hv206 <- as.factor(plottable_df$hv206)

ggplot(data = regions) +
    geom_sf() +
    geom_point(data=plottable_df, aes(x=LONGNUM, y=LATNUM, color=hv206, alpha=0.3)) +
    scale_color_manual(values = c("1" = "yellow", "0" = "white"))+
    labs(color="Access to Electricity")+ 
    ggtitle('Household Access to Electricity in Sierra Leone') + 
    theme_minimal()+
    ggspatial::annotation_scale()+
    ggspatial::annotation_north_arrow(location = "tr", which_north = "true") +
    labs(y= "", x = "") +
    theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        rect = element_blank())

    theme(plot.title = element_text(size = 20))
## List of 1
##  $ plot.title:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 20
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
    # We have a bit of a problem in that each cluster represents several families, so most likely we are just seeing one family per cluster which isn't necessarily representative. Perhaps a spatial interpolation method map would give a better overall impression of electricity availability.
    
    plottable_df$hv271 <- as.numeric(plottable_df$hv271)
    
    ggplot(data = regions) +
    geom_sf() +
    geom_point(data=plottable_df, aes(x=LONGNUM, y=LATNUM, color=(hv271))) +
       scale_color_gradient(low="white",
                     high="darkgreen", space ="Lab")+ 
    labs(color="Wealth Level")+ 
    ggtitle('Household Wealth Distribution in Sierra Leone') + 
    theme_minimal()+
    ggspatial::annotation_scale()+
    ggspatial::annotation_north_arrow(location = "tr", which_north = "true") +
    labs(y= "", x = "") +
    theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        rect = element_blank())

    theme(plot.title = element_text(size = 20))
## List of 1
##  $ plot.title:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 20
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Summarize data

# import data
sl19keep <- readRDS("./exports/sl19clean.rds")

Histograms

# histograms

hist(sl19keep$hv040, main="Altitude (m)") # altitude in meters

plot(sl19keep$hv024, main="Region") # region

plot(sl19keep$hv045c, main="Native Language")

ilg_hist <- function(x, bins, label, fill) {
  return(ggplot() + 
      geom_histogram(aes(x), color="black", fill=fill, bins=bins) +
      xlab(label) +
      ylab("Count") +
      ggtitle(label) +
      theme_bw()
  )
}


household <- ilg_hist(sl19keep$hv012, 33, "# Household Members", "purple")
household

women <- ilg_hist(sl19keep$hv010, 11, "# Women aged 15-49", "purple")
women

men <- ilg_hist(sl19keep$hv011, 11, "# Men aged 15-49", "purple")
men

children <- ilg_hist(sl19keep$hv014, 10, "# Children under 5 y.o.", "purple")
children

altitude <- ilg_hist(sl19keep$hv040, 30, "Altitude", "blue")
altitude

timetowater <- ilg_hist(sl19keep$hv204, 30, "Time to Water Source", "darkgreen")
timetowater # filter out high "unknowns"
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.

Bar Charts

ilg_bar <- function(x, label, fill) {
  return(ggplot() + 
      geom_bar(aes(x), color="black", fill=fill) +
      xlab(label) +
      ylab("Count") +
      ggtitle(label) +
      theme_bw()
  )
}

region <- ilg_bar(sl19keep$hv024, "Region", "blue")
region
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.

language <- ilg_bar(sl19keep$hv045c, "Language", "blue")
language
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.

watersource <- ilg_bar(sl19keep$hv201, "Source of Drinking Water", "darkgreen")
watersource
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.

Pie Charts

# pie charts of asset ownership
elec <- sum(sl19keep$hv206 == 1)
radio <- sum(sl19keep$hv207 == 1)
tele <- sum(sl19keep$hv208 == 1)
fridge <- sum(sl19keep$hv209 == 1)
bicycle <- sum(sl19keep$hv210 == 1)
motorcycle <- sum(sl19keep$hv211 == 1)
car <- sum(sl19keep$hv212 == 1)

n <- nrow(sl19keep)

pie(c(elec, n-elec), labels = c("Yes", "No"), main="Have Electricity", col=c("purple","white"))

pie(c(radio, n-radio), labels = c("Yes", "No"), main="Have Radio", col=c("purple","white"))

pie(c(tele, n-tele), labels = c("Yes", "No"), main="Have Television", col=c("purple","white"))

pie(c(fridge, n-fridge), labels = c("Yes", "No"), main="Have Fridge", col=c("purple","white"))

pie(c(bicycle, n-bicycle), labels = c("Yes", "No"), main="Have Bicycle", col=c("purple","white"))

pie(c(motorcycle, n-motorcycle), labels = c("Yes", "No"), main="Have Motorcycle", col=c("purple","white"))

pie(c(car, n-car), labels = c("Yes", "No"), main="Have Car", col=c("purple","white"))

Multivariable Bar Chart

# bar chart of asset ownership
counts <- c(elec,radio,tele,fridge,bicycle,motorcycle,car)
labels <- c("Electricity", "Radio", "Television", "Fridge", "Bicycle", "Motorcyle", "Car")
assets <- data.frame(asset=labels, count=counts)
ggplot(assets) +
  geom_bar(aes(y=count, x=asset), fill="purple", stat='identity') +
  ggtitle('Asset Ownership')+
  xlab('Asset') +
  ylab('Count') +
  ylim(0,15000) + 
  theme_bw() +
  scale_color_brewer(palette="Dark2")+
  geom_hline(yintercept=n, style="dashed", color="gray")
## Warning in geom_hline(yintercept = n, style = "dashed", color = "gray"):
## Ignoring unknown parameters: `style`

Kriging/Variogram

# In this block I am projecting the cluster points to a WGS84 projection for analysis

sl19clean <- readRDS("./exports/sl19clean.rds")
df <- sl19clean

# project the cluster points
coordinates(df) = c('LONGNUM', 'LATNUM')
proj4string(df) = CRS("+proj=longlat +datum=WGS84")
plottable_df_projected<- spTransform(df, CRS("+proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"))

saveRDS(plottable_df_projected, "./exports/plottable_df_projected.rds")
# In this block I am projecting the regions polygons to a WGS84 projection for analysis

regions <- readRDS("./exports/regions.rds")
# project the region boundaries
regions_oultine <-as(regions, "Spatial")

proj4string(regions_oultine) = CRS("+proj=longlat +datum=WGS84")
regions_projected<- spTransform(regions_oultine, CRS("+proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"))

# In this next chunk I am making the grid for IDW and Kriging

# Making grid
kriging_grid <- SpatialPixels(SpatialPoints(makegrid(regions_projected, n=15000)), proj4string = proj4string(regions_projected))
# Applying grid to Sierra Leone
kriging_grid <- kriging_grid[regions_projected,]
plot(kriging_grid)

saveRDS(kriging_grid, "./exports/kriging_grid.rds")
saveRDS(regions_projected, "./exports/regions_projected.rds")
plottable_df_projected <- readRDS("./exports/plottable_df_projected.rds")
df <- plottable_df_projected

kriging_grid <- readRDS("./exports/kriging_grid.rds")

# Inverse distance weighting
hv271.idw <- idw(formula = hv271 ~ 1, df, kriging_grid, idp = 2, nmax = 100)
## [inverse distance weighted interpolation]
plot(hv271.idw)

#Plotting variogram for sample data
# sample variogram based on Cressie method, it is often used to account for outliers
p.vgm <- variogram(hv271 ~ 1, df, cressie=T)
plot(p.vgm, type='l')

# Reference block to see variogram model options
vgm()
##    short                                      long
## 1    Nug                              Nug (nugget)
## 2    Exp                         Exp (exponential)
## 3    Sph                           Sph (spherical)
## 4    Gau                            Gau (gaussian)
## 5    Exc        Exclass (Exponential class/stable)
## 6    Mat                              Mat (Matern)
## 7    Ste Mat (Matern, M. Stein's parameterization)
## 8    Cir                            Cir (circular)
## 9    Lin                              Lin (linear)
## 10   Bes                              Bes (bessel)
## 11   Pen                      Pen (pentaspherical)
## 12   Per                            Per (periodic)
## 13   Wav                                Wav (wave)
## 14   Hol                                Hol (hole)
## 15   Log                         Log (logarithmic)
## 16   Pow                               Pow (power)
## 17   Spl                              Spl (spline)
## 18   Leg                            Leg (Legendre)
## 19   Err                   Err (Measurement error)
## 20   Int                           Int (Intercept)
show.vgms()

# Initial estimates for nugget, sill, and range (4,300,000,000; 7,900,000,000; 60,000)

saveRDS(hv271.idw, "./exports/hv271_krig_plot.rds")

# Fitting sample variogram to a model

initModel = vgm(psill = 7900000000, "Sph", range = 60000, nugget = 4300000000)
p.fitSph <- fit.variogram(p.vgm, model = initModel)
plot(p.vgm, pch = 20, cex = 1.5, col = "black", ylab = "Semivariance", xlab = "Distance (m)", model = p.fitSph)

Model = vgm(psill = 7900000000, "Exp", range = 60000, nugget = 4300000000)
p.fitExp <- fit.variogram(p.vgm, model = Model)
plot(p.vgm, pch = 20, cex = 1.5, col = "black", ylab = "Semivariance", xlab = "Distance (m)", model = p.fitExp)

Model2 = vgm(psill = 7900000000, "Lin", range = 60000, nugget = 4300000000)
p.fitMat <- fit.variogram(p.vgm, model = Model2)
plot(p.vgm, pch = 20, cex = 1.5, col = "black", ylab = "Semivariance", xlab = "Distance (m)", model = p.fitMat)

# After trying a few variograms, it appears Spherical is the best model

# Kriging model

# This line below is to remove duplicate locations as described here: https://gis.stackexchange.com/questions/222192/r-gstat-krige-covariance-matrix-singular-at-location-5-88-47-4-0-skipping
df <- df[-zerodist(df)[,1],] 

krigSph <- krige(hv271 ~ 1, df, kriging_grid, model = p.fitSph)
## [using ordinary kriging]
plot(krigSph)

saveRDS(krigSph, "./exports/kriging_model1.rds")

# Evaluating accuracy with Cross Validation - commented out to run faster, but can be uncommented and run to evaluate

# cvKriging<-krige.cv(hv271~1, df, kriging_grid, model=p.fitSph)
# # get the RMSE
# rmse<-sqrt(mean(cvKriging$residual^2))
# rmse
# 
cvKriging<-krige.cv(hv271~1, df, kriging_grid, model=p.fitExp)
# get the RMSE
rmse2<-sqrt(mean(cvKriging$residual^2))
rmse2
## [1] 63120.57
# It turns ou the p.fitExp fits better than the p.fitSph so I am now going to make a kriging with p.fitExp

krigExp <- krige(hv271 ~ 1, df, kriging_grid, model = p.fitExp)
## [using ordinary kriging]
plot(krigExp, main="Spatial Inter")

saveRDS(krigSph, "./exports/better_kriging_model.rds")

Convert point data into area data for Spatial Regression

# reread in dataframe
sl19clean <- readRDS("./exports/sl19clean.rds")
df <- sl19clean


# define columns to keep
assetColumns <- c("hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv221","hv227","hv243a","hv243b","hv243c","hv243d","hv243e","hv246a","hv246b","hv246c","hv246d","hv246e","hv246f","hv247", "hv271")
gpsColumns <- c("LATNUM", "LONGNUM", "ALT_DEM")
srColumns <- c(assetColumns, gpsColumns)


# filter big data frame to just columns needed for this
df <- df[,srColumns]

# convert all columns to numeric
df <- as.data.frame(lapply(df, as.numeric)) 

# add geometry back in
df$geometry <- sl19clean$geometry

# filter out "unknown" or "missing" values from survey
df <- na.omit(df) # remove NA values
df <- df %>% filter(
  hv246a < 95 &
  hv246b < 95 &
  hv246c < 95 &
  hv246d < 95 &
  hv246e < 95 &
  hv246f < 95
)


# remove geometry again
geometry <- df$geometry
df = select(df, -geometry)

# aggregate data to cluster level, keep means of other variables
df <- aggregate.data.frame(df, list(lat = df$LATNUM, lon = df$LONGNUM), mean)
sl19clusters <- df

# read in country and regions
regions <- readRDS("./exports/regions.rds")
country <- readRDS("./exports/country.rds")

# remove the islands from the polygon list
country_multi <- as.data.frame(st_cast(country$geometry, "POLYGON"))
country_multi$area_sqm <- st_area(country_multi$geometry)
sl.sf <- country_multi[1,"geometry"]

# convert the countries shapefile (sf) into SpatialPolygon (sp)
country.sp <- as(sl.sf, 'Spatial')

# create voronoi tesselation plot
ggplot(df) + 
  ggvoronoi::geom_voronoi(aes(lon, lat, fill=hv271), outline = country.sp) + 
  scale_fill_gradient(low="white", high="blue") +
  geom_sf(data=regions, fill=NA, color="black") +
  ggtitle("Voronoi Tesellation of Sierra Leone")
## Warning: GEOS support is provided by the sf and terra packages among others

## Warning: GEOS support is provided by the sf and terra packages among others
## Warning in gBuffer(outline_spdf, byid = TRUE, width = 0): Spatial object is not
## projected; GEOS expects planar coordinates

# create voronoi tesselation SpatialPolygon
areas.sp <- ggvoronoi::voronoi_polygon(df, x="lon", y="lat", outline = country.sp)
## Warning: GEOS support is provided by the sf and terra packages among others
## Warning: GEOS support is provided by the sf and terra packages among others
## Warning in gBuffer(outline_spdf, byid = TRUE, width = 0): Spatial object is not
## projected; GEOS expects planar coordinates
# convert voronoi SpatialPolygon to a shapefile
areas.sf <- sf::st_as_sf(areas.sp)

# # remove areas with multiple polygons
# areas.sf$n_polygons <- lapply(areas.sf$geometry, length)
# areas.sf <- filter(areas.sf, n_polygons == 1)

# convert list of polygons into individual polygons
# geometry <- areas.sf$geometry
# geometry2 <- st_cast(geometry, "POLYGON")
# areas.sf$geometry <- geometry2
# areas.sf$geometry3 <- lapply(areas.sf$geometry, unlist)

# plot results
plot(areas.sf)
## Warning: plotting the first 9 out of 27 attributes; use max.plot = 27 to plot
## all

ggplot(data=areas.sf) +
  geom_sf(aes(fill=hv271)) +
  scale_fill_gradient(low="white", high="darkgreen") +
  ggtitle("Mean Wealth Index across Sierra Leone ")

ggplot(data=areas.sf) +
  geom_sf(aes(fill=hv206*100)) +
  scale_fill_gradient(low="white", high="gold") +
  ggtitle("% with Electricity Access in Sierra Leone")

# map just areas and points
ggplot() +
  geom_sf(data=areas.sf) +
  geom_point(data=plottable_df, aes(x=LONGNUM, y=LATNUM), cex=0.5) +
  ggtitle("Thiessen Polygons of DHS 2019 Clusters in Sierra Leone")

cor(areas.sf$hv206, areas.sf$hv271, method = "pearson")
## [1] 0.8845982
# export area
saveRDS(areas.sf, "./exports/areas.rds")   
saveRDS(sl19clusters, "./exports/sl19clusters.rds")   

Linear Regression

Cut Down Dataframe to Numeric Columns

# import dataframe with shape file
sl19areas <- readRDS("./exports/areas.rds")

# keep just asset columns 
assetColumns <- c("hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv221","hv227","hv243a","hv243b","hv243c","hv243d","hv243e","hv246a","hv246b","hv246c","hv246d","hv246e","hv246f","hv247", "hv271")
sl19num <- sl19areas[,assetColumns]

# remove rows that have no neighbors
df <- df[-c(47, 68), ]

# save down dataframe
saveRDS(sl19num, "./exports/sl19num.rds")

Basic Multiple Linear Regression

# read in numeric dataframe
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num

# process df for lm
geometry <- df$geometry
df <- df %>% st_drop_geometry()

# do with no variables
lm.mean <- lm(hv271 ~ 1, data=df)
summary(lm.mean)
## 
## Call:
## lm(formula = hv271 ~ 1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -110194  -70606  -42934   67121  229770 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     1148       3781   0.304    0.762
## 
## Residual standard error: 88920 on 552 degrees of freedom
# mean = 623
rmse.lm.mean <- sqrt(mean(residuals(lm.mean)^2))
rmse.lm.mean # 88899.5
## [1] 88841.1
# map just areas and points
ggplot() +
  geom_sf(data=areas.sf, fill="darkgreen", color="black", alpha=0.5) +
  ggtitle("Null Model of Wealth Index across Sierra Leone") +
  theme_void()

# basic linear model
lm.basic <- lm(hv271 ~ ., data=df)
summary(lm.basic)
## 
## Call:
## lm(formula = hv271 ~ ., data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -72590 -13545    -17  12458  78689 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -85372.6     4999.4 -17.076  < 2e-16 ***
## hv206         40871.1    12853.5   3.180 0.001560 ** 
## hv207         36356.4     6995.2   5.197 2.89e-07 ***
## hv208         56968.7    18013.3   3.163 0.001653 ** 
## hv209         24710.2    17751.9   1.392 0.164514    
## hv210         46779.4    15788.9   2.963 0.003185 ** 
## hv211           208.9    11527.8   0.018 0.985549    
## hv212        138908.1    24755.0   5.611 3.24e-08 ***
## hv221         79367.9    63907.3   1.242 0.214814    
## hv227        -37549.5     5276.9  -7.116 3.62e-12 ***
## hv243a        99366.5     6433.6  15.445  < 2e-16 ***
## hv243b         7346.1     6346.4   1.158 0.247577    
## hv243c      -171181.0   104039.6  -1.645 0.100490    
## hv243d       -57250.1    17030.5  -3.362 0.000831 ***
## hv243e        17504.0    24117.0   0.726 0.468283    
## hv246a          610.1     3096.3   0.197 0.843867    
## hv246b          516.6     1112.0   0.465 0.642401    
## hv246c        34105.3    13185.4   2.587 0.009958 ** 
## hv246d       -11901.4     1877.1  -6.340 4.91e-10 ***
## hv246e         3365.4     2950.5   1.141 0.254550    
## hv246f        -2818.3      452.5  -6.228 9.60e-10 ***
## hv247         71281.3    10129.8   7.037 6.10e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20480 on 531 degrees of freedom
## Multiple R-squared:  0.949,  Adjusted R-squared:  0.947 
## F-statistic: 470.4 on 21 and 531 DF,  p-value: < 2.2e-16
rmse.lm.basic <- sqrt(mean(residuals(lm.basic)^2))
rmse.lm.basic # 19785.72
## [1] 20066.05
# adjusted R-squared is 0.9484
# r-squared is 0.9505

# define neighbors & weights
queen_neighbors <- spdep::poly2nb(geometry)
queen_weights <- spdep::nb2listw(queen_neighbors, zero.policy = TRUE)
spdep::lm.morantest(lm.basic, queen_weights)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = hv271 ~ ., data = df)
## weights: queen_weights
## 
## Moran I statistic standard deviate = 8.6545, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.2115698419    -0.0089133777     0.0006490268
# try cross validation
set.seed(44)
df <- sl19num
geometry <- df$geometry
df <- df %>% st_drop_geometry()
cv <- trainControl(method = 'cv', number = 5)
lm.cv <- train(hv271 ~ ., data=df, method="lm", trControl = cv)
print(lm.cv) # RMSE 21674
## Linear Regression 
## 
## 553 samples
##  21 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 443, 441, 442, 444, 442 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   21964.89  0.9389016  16959.22
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
rmse.lm.cv <- sqrt(mean(residuals(lm.cv)^2))
rmse.lm.cv # 19785
## [1] 20066.05
# try again without variables p > 0.5
sigColumns <- c("hv206","hv207","hv208","hv209","hv210","hv212","hv221","hv227","hv243a","hv243b","hv243c","hv243d","hv243e","hv246c","hv246d","hv246e","hv246f","hv247", "hv271")
df <- df[,sigColumns]
lm.improved <- lm(hv271 ~ ., data=df)
summary(lm.improved)
## 
## Call:
## lm(formula = hv271 ~ ., data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -72583 -13485   -108  12712  78892 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -85548.0     4887.0 -17.505  < 2e-16 ***
## hv206         40951.4    12741.0   3.214 0.001387 ** 
## hv207         35555.6     6808.6   5.222 2.54e-07 ***
## hv208         56674.1    17938.5   3.159 0.001671 ** 
## hv209         24951.5    17640.1   1.414 0.157805    
## hv210         46984.3    15498.0   3.032 0.002550 ** 
## hv212        138868.5    24685.3   5.626 2.99e-08 ***
## hv221         79086.8    63730.7   1.241 0.215168    
## hv227        -37635.1     5184.3  -7.259 1.38e-12 ***
## hv243a        99825.5     6241.3  15.994  < 2e-16 ***
## hv243b         8014.8     6173.8   1.298 0.194781    
## hv243c      -172266.3   103151.8  -1.670 0.095500 .  
## hv243d       -57425.1    16758.9  -3.427 0.000658 ***
## hv243e        17334.9    23992.6   0.723 0.470297    
## hv246c        35117.6    12827.0   2.738 0.006392 ** 
## hv246d       -11617.2     1780.8  -6.524 1.59e-10 ***
## hv246e         3677.8     2815.4   1.306 0.192019    
## hv246f        -2826.9      450.9  -6.270 7.46e-10 ***
## hv247         71194.5    10019.5   7.106 3.85e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20430 on 534 degrees of freedom
## Multiple R-squared:  0.949,  Adjusted R-squared:  0.9472 
## F-statistic: 551.5 on 18 and 534 DF,  p-value: < 2.2e-16
# adjusted R-squared goes up to 0.9487
# r-squared stays up at 0.9504

# try again without variables p > 0.05
vsigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
df <- df[,vsigColumns]
lm.vimproved <- lm(hv271 ~ ., data=df)
summary(lm.vimproved)
## 
## Call:
## lm(formula = hv271 ~ ., data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71626 -14226    142  12286  77517 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -85582.5     4860.5 -17.608  < 2e-16 ***
## hv206        38801.0    12403.5   3.128 0.001854 ** 
## hv207        36584.1     6540.0   5.594 3.53e-08 ***
## hv208        78537.2    15344.4   5.118 4.29e-07 ***
## hv210        50220.9    15144.3   3.316 0.000974 ***
## hv212       163418.5    20706.1   7.892 1.66e-14 ***
## hv227       -36968.6     5170.4  -7.150 2.83e-12 ***
## hv243a      101483.5     6151.2  16.498  < 2e-16 ***
## hv243d      -60706.9    16682.1  -3.639 0.000300 ***
## hv246c       41122.6    12164.6   3.381 0.000776 ***
## hv246d      -10197.8     1521.0  -6.704 5.10e-11 ***
## hv246f       -2776.5      450.6  -6.162 1.40e-09 ***
## hv247        74367.4     9725.6   7.647 9.51e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20560 on 540 degrees of freedom
## Multiple R-squared:  0.9477, Adjusted R-squared:  0.9466 
## F-statistic: 815.7 on 12 and 540 DF,  p-value: < 2.2e-16
rmse.lm.vimproved <- sqrt(mean(residuals(lm.vimproved)^2))
rmse.lm.vimproved # 20049.49
## [1] 20313.64
# adjusted R-squared goes up to 0.9517
# r-squared goes up at 0.9531!
# much better model

# try cross validation again with smaller model
set.seed(44)
vsigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
df <- df[,vsigColumns]
lm.cv.improved <- train(hv271 ~ ., data=df, method="lm", trControl = cv)
print(lm.cv.improved) # RMSE 20902
## Linear Regression 
## 
## 553 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 443, 441, 442, 444, 442 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   21020.78  0.9445597  16568.09
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
rmse.lm.cv.improved <- sqrt(mean(residuals(lm.cv.improved)^2))
rmse.lm.cv.improved # 20049.49
## [1] 20313.64
# map null model
ggplot() +
  geom_sf(data=areas.sf, fill="darkgreen", color="black", alpha=0.5) +
  ggtitle("Null Model of Wealth Index across Sierra Leone")

# map null model residuals
df$geometry <- geometry
df$residuals <- residuals(lm.mean)
ggplot(df) +
  geom_sf(aes(fill=residuals, geometry=geometry), color="transparent") +
  scale_fill_gradient2() +
  ggtitle("Map of Null Model Residuals")

# map residuals
df$geometry <- geometry
df$residuals <- residuals(lm.basic)
ggplot(df) +
  geom_sf(aes(fill=residuals, geometry=geometry), color="transparent") +
  scale_fill_gradient2() +
  ggtitle("Map of Linear Model Residuals")

# histogram of residuals
ggplot(df) +
  geom_histogram(aes(residuals), fill="darkgreen", bins = 20)

# save down significant columns df
df$geometry <- geometry
sl19lm <- df
saveRDS(sl19lm, "./exports/sl19lm.rds")

Linear Regression with a Train/Test Split

# read in data
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num

# split train / test data
f = 4/5
n = nrow(df)
set.seed(44)
train <- sample(n, n*f, replace=FALSE)
traindf <- df[train,]
testdf <- df[-train,]

# prep for lm
geometry <- testdf$geometry
traindf <- traindf %>% st_drop_geometry()
testdf <- testdf %>% st_drop_geometry()
df <- traindf

# run lm
lm.train <- lm(hv271 ~ ., data=df)
summary(lm.train)
## 
## Call:
## lm(formula = hv271 ~ ., data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69095 -14052  -1130  11336  76074 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -87027.38    5585.93 -15.580  < 2e-16 ***
## hv206         35055.57   14888.17   2.355 0.019002 *  
## hv207         32463.10    7943.36   4.087 5.24e-05 ***
## hv208         73026.58   20767.20   3.516 0.000485 ***
## hv209         14419.92   20050.49   0.719 0.472430    
## hv210         43473.13   17847.42   2.436 0.015273 *  
## hv211         -8287.26   13130.91  -0.631 0.528302    
## hv212        156336.70   27863.84   5.611 3.66e-08 ***
## hv221         92167.96   69265.86   1.331 0.184029    
## hv227        -34306.70    5974.95  -5.742 1.80e-08 ***
## hv243a       100099.22    7221.33  13.862  < 2e-16 ***
## hv243b         6339.94    7235.74   0.876 0.381423    
## hv243c      -156781.23  116386.80  -1.347 0.178684    
## hv243d       -47068.43   18763.53  -2.509 0.012500 *  
## hv243e        -7619.45   26637.00  -0.286 0.774983    
## hv246a          -53.72    3272.63  -0.016 0.986910    
## hv246b          418.30    1156.62   0.362 0.717789    
## hv246c       107440.34   48190.51   2.229 0.026309 *  
## hv246d       -13013.17    2317.69  -5.615 3.58e-08 ***
## hv246e         7240.95    3488.86   2.075 0.038553 *  
## hv246f        -2688.34     513.13  -5.239 2.56e-07 ***
## hv247         76855.74   12023.27   6.392 4.35e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20600 on 420 degrees of freedom
## Multiple R-squared:   0.95,  Adjusted R-squared:  0.9475 
## F-statistic: 379.9 on 21 and 420 DF,  p-value: < 2.2e-16
pred.lm <- predict(object = lm.train, newdata = testdf)
rmse.lm <- rmse(actual = testdf$hv271, predicted = pred.lm )
print(rmse.lm) # RMSE 22393 (worse than CV)
## [1] 24201.98
# try a test/train split again with fewer columns
vsigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
df <- traindf[,vsigColumns]
testdf <- testdf[,vsigColumns]
lm.train.improved <- lm(hv271 ~ ., data=df)
summary(lm.train.improved) # R2 0.9531
## 
## Call:
## lm(formula = hv271 ~ ., data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70425 -14199  -1060  12158  75068 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87112.2     5420.4 -16.071  < 2e-16 ***
## hv206        34706.0    14339.1   2.420   0.0159 *  
## hv207        31768.7     7484.6   4.245 2.69e-05 ***
## hv208        84842.3    17248.9   4.919 1.24e-06 ***
## hv210        43422.7    17219.8   2.522   0.0120 *  
## hv212       167087.5    22586.1   7.398 7.35e-13 ***
## hv227       -34614.7     5798.4  -5.970 4.99e-09 ***
## hv243a      102425.8     6886.6  14.873  < 2e-16 ***
## hv243d      -47348.7    18403.9  -2.573   0.0104 *  
## hv246c      112947.9    47465.8   2.380   0.0178 *  
## hv246d       -9791.0     1798.0  -5.445 8.72e-08 ***
## hv246f       -2562.7      511.4  -5.011 7.93e-07 ***
## hv247        75569.4    11505.6   6.568 1.48e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20670 on 429 degrees of freedom
## Multiple R-squared:  0.9486, Adjusted R-squared:  0.9471 
## F-statistic: 659.4 on 12 and 429 DF,  p-value: < 2.2e-16
pred.lm.improved <- predict(object = lm.train.improved, newdata = testdf)
rmse.lm.improved <- rmse(actual = testdf$hv271, predicted = pred.lm.improved )
print(rmse.lm.improved) # RMSE 22619 (not better)
## [1] 23592.15
# try a test/train split with few columns and cross validation
df <- traindf[,vsigColumns]
testdf <- testdf[,vsigColumns]
cv <- trainControl(method = 'cv', number = 5)
lm.cv.train <- train(hv271 ~ ., data=df, method="lm", trControl = cv)
print(lm.cv.train) # RMSE 20651, R-squared 0.9487
## Linear Regression 
## 
## 442 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 354, 354, 353, 354, 353 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   20867.56  0.9454514  16357.48
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
pred.lm.cv.train <- predict(object = lm.cv.train, newdata = testdf)
rmse.lm.cv.train <- rmse(actual = testdf$hv271, predicted = pred.lm.cv.train )
print(rmse.lm.cv.train) # RMSE 22619.46 (not better - exactly the same as 80/20)
## [1] 23592.15

Spatial Regression

Spatial Lag Model

# get same data as for regression models
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num
geometry <- df$geometry

# define neighbors & weights
queen_neighbors <- spdep::poly2nb(geometry)
queen_weights <- spdep::nb2listw(queen_neighbors, zero.policy = TRUE)

# null spatial lag model
df <- df %>% st_drop_geometry()
lm.lag.null <- lagsarlm(hv271 ~ 1, data=df, queen_weights, zero.policy = TRUE)
## Warning in lagsarlm(hv271 ~ 1, data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 3.09995e-20 - using numerical Hessian.
summary(lm.lag.null)
## 
## Call:lagsarlm(formula = hv271 ~ 1, data = df, listw = queen_weights, 
##     zero.policy = TRUE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -135748.5  -30642.3   -8229.1   22728.4  181907.1 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   899.37    1805.88   0.498   0.6185
## 
## Rho: 0.85025, LR test value: 623.81, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.020406
##     z-value: 41.667, p-value: < 2.22e-16
## Wald statistic: 1736.1, p-value: < 2.22e-16
## 
## Log likelihood: -6773.982 for lag model
## ML residual variance (sigma squared): 2082300000, (sigma: 45633)
## Number of observations: 553 
## Number of parameters estimated: 3 
## AIC: 13554, (AIC for lm: 14176)
rmse.lm.lag.null <- sqrt(mean(residuals(lm.lag.null)^2))
rmse.lm.lag.null # 45131
## [1] 45632.65
# spatial lag model
lm.lag <- lagsarlm(hv271 ~ ., data=df, queen_weights, zero.policy = TRUE)
## Warning in lagsarlm(hv271 ~ ., data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 2.2272e-19 - using numerical Hessian.
## Warning in sqrt(diag(fdHess)[-1]): NaNs produced
summary(lm.lag)
## 
## Call:lagsarlm(formula = hv271 ~ ., data = df, listw = queen_weights, 
##     zero.policy = TRUE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -61655.84 -13567.67   -390.47  11843.22  74602.26 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##                Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept)  -79637.722    4597.905 -17.3204 < 2.2e-16
## hv206         35222.625   11267.725   3.1260  0.001772
## hv207         32023.061    6249.282   5.1243 2.987e-07
## hv208         34813.509   17214.024   2.0224  0.043136
## hv209         12328.026   14030.529   0.8787  0.379587
## hv210         39684.103   13571.732   2.9240  0.003455
## hv211         17126.059   10376.510   1.6505  0.098848
## hv212        136616.393   22624.041   6.0385 1.555e-09
## hv221         57276.780   57173.736   1.0018  0.316439
## hv227        -27130.154    4825.654  -5.6221 1.887e-08
## hv243a        84287.480    6108.230  13.7990 < 2.2e-16
## hv243b         8702.867    4638.643   1.8762  0.060632
## hv243c      -159340.756   96454.466  -1.6520  0.098539
## hv243d       -44894.176   15730.744  -2.8539  0.004318
## hv243e        32089.657   20878.524   1.5370  0.124301
## hv246a         -259.343         NaN      NaN       NaN
## hv246b           11.338         NaN      NaN       NaN
## hv246c        29966.902    9492.951   3.1568  0.001595
## hv246d       -10072.715    1587.216  -6.3462 2.208e-10
## hv246e         3620.080    2040.447   1.7742  0.076037
## hv246f        -2520.584     411.986  -6.1181 9.468e-10
## hv247         78276.090    9322.626   8.3964 < 2.2e-16
## 
## Rho: 0.18349, LR test value: 64.899, p-value: 7.7716e-16
## Approximate (numerical Hessian) standard error: 0.021884
##     z-value: 8.3846, p-value: < 2.22e-16
## Wald statistic: 70.302, p-value: < 2.22e-16
## 
## Log likelihood: -6230.676 for lag model
## ML residual variance (sigma squared): 355790000, (sigma: 18862)
## Number of observations: 553 
## Number of parameters estimated: 24 
## AIC: 12509, (AIC for lm: 12572)
# AIC 12040 instead of 12104 for lm
# ro = 0.1864, p-value << 0.05

# pull out significant columns from spatial model
spSigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
# same as vsigColumns from non-spatial model

# rerun model with fewer columns
df <- df[,spSigColumns]
lm.lag.improved <- lagsarlm(hv271 ~ ., data=df, queen_weights, zero.policy = TRUE)
## Warning in lagsarlm(hv271 ~ ., data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 2.16936e-19 - using numerical Hessian.
summary(lm.lag.improved)
## 
## Call:lagsarlm(formula = hv271 ~ ., data = df, listw = queen_weights, 
##     zero.policy = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -60135.0 -14010.9  -1344.4  11931.0  73014.8 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##              Estimate Std. Error  z value  Pr(>|z|)
## (Intercept) -81085.38    4559.35 -17.7844 < 2.2e-16
## hv206        29426.44   11643.84   2.5272 0.0114972
## hv207        34465.04    6100.06   5.6499 1.605e-08
## hv208        55376.63   14600.72   3.7927 0.0001490
## hv210        49730.85   14125.72   3.5206 0.0004306
## hv212       159744.17   19277.61   8.2865 2.220e-16
## hv227       -25589.95    5012.70  -5.1050 3.308e-07
## hv243a       88543.80    5936.76  14.9145 < 2.2e-16
## hv243d      -52392.18   15545.11  -3.3703 0.0007508
## hv246c       37277.61   11338.89   3.2876 0.0010105
## hv246d       -8424.34    1432.49  -5.8809 4.080e-09
## hv246f       -2507.38     420.91  -5.9570 2.569e-09
## hv247        82821.39    9112.14   9.0891 < 2.2e-16
## 
## Rho: 0.17731, LR test value: 62.742, p-value: 2.3315e-15
## Approximate (numerical Hessian) standard error: 0.021657
##     z-value: 8.1871, p-value: 2.2204e-16
## Wald statistic: 67.028, p-value: 2.2204e-16
## 
## Log likelihood: -6238.535 for lag model
## ML residual variance (sigma squared): 366200000, (sigma: 19136)
## Number of observations: 553 
## Number of parameters estimated: 15 
## AIC: 12507, (AIC for lm: 12568)
rmse.lm.lag.improved <- sqrt(mean(residuals(lm.lag.improved)^2))
rmse.lm.lag.improved # 18807.81
## [1] 19136.45
# AIC 12038 instead of 12100 for lm
# ro = 0.18041, p-value << 0.05

# map residuals
df$geometry <- geometry
df$residuals <- residuals(lm.lag.improved)
ggplot(df) +
  geom_sf(aes(fill=residuals, geometry=geometry), color="transparent") +
  scale_fill_gradient2() +
  ggtitle("Map of Spatial Lag Model Residuals")

# histogram of residuals
ggplot(df) +
  geom_histogram(aes(residuals), fill="purple", bins = 15)

Spatial Error Model

# get same data as for regression models
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num
geometry <- df$geometry
df <- st_drop_geometry(df)

# define neighbors & weights
queen_neighbors <- spdep::poly2nb(geometry)
queen_weights <- spdep::nb2listw(queen_neighbors, zero.policy = TRUE)

# try spatial error model
lm.err <- errorsarlm(hv271 ~ ., data=df, queen_weights, zero.policy = TRUE)
## Warning in errorsarlm(hv271 ~ ., data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 5.38187e-18 - using numerical Hessian.
summary(lm.err)
## 
## Call:errorsarlm(formula = hv271 ~ ., data = df, listw = queen_weights, 
##     zero.policy = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -60639.7 -12600.8  -1148.3  11869.7  64050.2 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##               Estimate Std. Error  z value  Pr(>|z|)
## (Intercept)  -84370.11    5078.74 -16.6124 < 2.2e-16
## hv206         49716.46   11724.72   4.2403 2.232e-05
## hv207         28237.25    6471.72   4.3632 1.282e-05
## hv208         58050.38   16494.38   3.5194 0.0004325
## hv209         20672.88   15662.18   1.3199 0.1868606
## hv210         35733.73   14387.61   2.4836 0.0130045
## hv211         22711.01   10341.79   2.1960 0.0280888
## hv212        110421.83   22522.82   4.9027 9.454e-07
## hv221         36088.67   56531.09   0.6384 0.5232223
## hv227        -25031.13    5385.39  -4.6480 3.352e-06
## hv243a        83401.67    6222.69  13.4028 < 2.2e-16
## hv243b        11192.76    6390.94   1.7513 0.0798861
## hv243c      -229653.33   91738.11  -2.5034 0.0123021
## hv243d       -66884.89   18074.79  -3.7005 0.0002152
## hv243e        25546.14   20979.65   1.2177 0.2233522
## hv246a         1059.11    2779.37   0.3811 0.7031569
## hv246b         1079.65    1045.09   1.0331 0.3015738
## hv246c        39743.94   11213.64   3.5442 0.0003937
## hv246d       -10596.08    1712.59  -6.1872 6.126e-10
## hv246e          254.10    2752.53   0.0923 0.9264469
## hv246f        -2332.01     423.46  -5.5070 3.650e-08
## hv247         71971.69    9398.45   7.6578 1.887e-14
## 
## Lambda: 0.52797, LR test value: 70.433, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.052836
##     z-value: 9.9927, p-value: < 2.22e-16
## Wald statistic: 99.853, p-value: < 2.22e-16
## 
## Log likelihood: -6227.908 for error model
## ML residual variance (sigma squared): 333640000, (sigma: 18266)
## Number of observations: 553 
## Number of parameters estimated: 24 
## AIC: 12504, (AIC for lm: 12572)
# AIC 12030 instead of 12104 for lm
# lambda = 0.53617, p-value << 0.05

# pull out significant columns from spatial model
spSigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
# same as vsigColumns from non-spatial model

# rerun model with fewer columns
df <- df[,spSigColumns]
lm.err.improved <- errorsarlm(hv271 ~ ., data=df, queen_weights, zero.policy = TRUE)
## Warning in errorsarlm(hv271 ~ ., data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 5.48439e-18 - using numerical Hessian.
summary(lm.err.improved)
## 
## Call:errorsarlm(formula = hv271 ~ ., data = df, listw = queen_weights, 
##     zero.policy = TRUE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -60613.58 -13156.52   -413.76  11717.17  65445.18 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error  z value  Pr(>|z|)
## (Intercept) -85801.72    5039.91 -17.0244 < 2.2e-16
## hv206        45994.37   11726.71   3.9222 8.775e-05
## hv207        31630.05    6220.46   5.0848 3.679e-07
## hv208        77151.88   14939.90   5.1641 2.415e-07
## hv210        43544.05   14207.27   3.0649  0.002177
## hv212       133253.86   18932.13   7.0385 1.943e-12
## hv227       -24403.04    5394.83  -4.5234 6.085e-06
## hv243a       89899.05    6020.66  14.9318 < 2.2e-16
## hv243d      -76285.03   17926.24  -4.2555 2.086e-05
## hv246c       42115.96   10629.97   3.9620 7.432e-05
## hv246d       -9963.92    1540.35  -6.4686 9.890e-11
## hv246f       -2373.07     429.99  -5.5189 3.412e-08
## hv247        79620.56    9113.82   8.7362 < 2.2e-16
## 
## Lambda: 0.48308, LR test value: 63.831, p-value: 1.3323e-15
## Approximate (numerical Hessian) standard error: 0.053024
##     z-value: 9.1105, p-value: < 2.22e-16
## Wald statistic: 83.002, p-value: < 2.22e-16
## 
## Log likelihood: -6237.991 for error model
## ML residual variance (sigma squared): 349880000, (sigma: 18705)
## Number of observations: 553 
## Number of parameters estimated: 15 
## AIC: 12506, (AIC for lm: 12568)
rmse.lm.err.improved <- sqrt(mean(residuals(lm.err.improved)^2))
rmse.lm.err.improved # 18228.15
## [1] 18704.97
# AIC 12030 instead of 12100 for lm
# lamda = 0.5018, p-value << 0.05

# map residuals
df$geometry <- geometry
df$residuals <- residuals(lm.err.improved)
ggplot(df) +
  geom_sf(aes(fill=residuals, geometry=geometry), color="transparent") +
  scale_fill_gradient2() +
  ggtitle("Map of Spatial Error Model Residuals")

# histogram of residuals
ggplot(df) +
  geom_histogram(aes(residuals), fill="blue", bins = 15)

### Run Significance Tests

# read in lm dataframe
sl19lm <- readRDS("./exports/sl19lm.rds")
df <- sl19lm
df <- df[-c(47, 68), ] # try removing regions with no neighbors
geometry <- df$geometry
df = select(df, -geometry)

# define neighbors & weights
queen_neighbors <- spdep::poly2nb(geometry)
queen_weights <- spdep::nb2listw(queen_neighbors, zero.policy = TRUE)

# basic linear model
lm.basic <- lm(hv271 ~ ., data=df)

# run lm tests
lmtests <- lm.LMtests(lm.basic, listw=queen_weights, test="all")
summary(lmtests)
##  Lagrange multiplier diagnostics for spatial dependence
## data:  
## model: lm(formula = hv271 ~ ., data = df)
## weights: queen_weights
##  
##        statistic parameter   p.value    
## LMerr    12.5857         1 0.0003887 ***
## LMlag     2.1162         1 0.1457512    
## RLMerr   12.0503         1 0.0005178 ***
## RLMlag    1.5808         1 0.2086464    
## SARMA    14.1665         2 0.0008391 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## all very significant! 

# statistic parameter      p.value    
# LMerr     77.846         1 < 2.2e-16 ***
# LMlag     65.565         1 5.551e-16 ***
# RLMerr    41.468         1 1.198e-10 ***
# RLMlag    29.186         1 6.574e-08 ***

# RLMerr has a p-value 600X smaller than RLM lag, so we'll go with the spatial error model!
# I guess this tells us that there's sysStematic bias in the errors that we're not accounting for. Likely from ommitted variables....probably the categorical variables that I wasn't able to include.

Geographically Weighted Regression

# import sf dataframe 
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num

# cut down to significant columns
spSigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
df <- df[,spSigColumns]

# convert to a spatial polygon df
df.sp <- as(df, "Spatial")

# find the right bandwidth for gwr
bw <- bw.gwr(hv271 ~ ., data = df.sp, adaptive=T)
## Adaptive bandwidth: 349 CV score: 2.19461e+11 
## Adaptive bandwidth: 224 CV score: 228092871703 
## Adaptive bandwidth: 428 CV score: 217558175915 
## Adaptive bandwidth: 475 CV score: 218963482933 
## Adaptive bandwidth: 397 CV score: 216315109381 
## Adaptive bandwidth: 379 CV score: 217028630754 
## Adaptive bandwidth: 409 CV score: 216915508517 
## Adaptive bandwidth: 390 CV score: 216187113706 
## Adaptive bandwidth: 385 CV score: 216240570769 
## Adaptive bandwidth: 392 CV score: 216414476082 
## Adaptive bandwidth: 387 CV score: 2.16179e+11 
## Adaptive bandwidth: 387 CV score: 2.16179e+11
bw # 385
## [1] 387
# run gwr
lm.gwr <- gwr.basic(hv271 ~ ., data=df.sp, bw=bw, kernel="gaussian", adaptive=TRUE)
lm.gwr
##    ***********************************************************************
##    *                       Package   GWmodel                             *
##    ***********************************************************************
##    Program starts at: 2024-03-03 20:42:10 
##    Call:
##    gwr.basic(formula = hv271 ~ ., data = df.sp, bw = bw, kernel = "gaussian", 
##     adaptive = TRUE)
## 
##    Dependent (y) variable:  hv271
##    Independent variables:  .
##    Number of data points: 553
##    ***********************************************************************
##    *                    Results of Global Regression                     *
##    ***********************************************************************
## 
##    Call:
##     lm(formula = formula, data = data)
## 
##    Residuals:
##    Min     1Q Median     3Q    Max 
## -71626 -14226    142  12286  77517 
## 
##    Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
##    (Intercept) -85582.5     4860.5 -17.608  < 2e-16 ***
##    hv206        38801.0    12403.5   3.128 0.001854 ** 
##    hv207        36584.1     6540.0   5.594 3.53e-08 ***
##    hv208        78537.2    15344.4   5.118 4.29e-07 ***
##    hv210        50220.9    15144.3   3.316 0.000974 ***
##    hv212       163418.5    20706.1   7.892 1.66e-14 ***
##    hv227       -36968.6     5170.4  -7.150 2.83e-12 ***
##    hv243a      101483.5     6151.2  16.498  < 2e-16 ***
##    hv243d      -60706.9    16682.1  -3.639 0.000300 ***
##    hv246c       41122.6    12164.6   3.381 0.000776 ***
##    hv246d      -10197.8     1521.0  -6.704 5.10e-11 ***
##    hv246f       -2776.5      450.6  -6.162 1.40e-09 ***
##    hv247        74367.4     9725.6   7.647 9.51e-14 ***
## 
##    ---Significance stars
##    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
##    Residual standard error: 20560 on 540 degrees of freedom
##    Multiple R-squared: 0.9477
##    Adjusted R-squared: 0.9466 
##    F-statistic: 815.7 on 12 and 540 DF,  p-value: < 2.2e-16 
##    ***Extra Diagnostic information
##    Residual sum of squares: 228192061099
##    Sigma(hat): 20350.47
##    AIC:  12567.81
##    AICc:  12568.59
##    BIC:  12163.64
##    ***********************************************************************
##    *          Results of Geographically Weighted Regression              *
##    ***********************************************************************
## 
##    *********************Model calibration information*********************
##    Kernel function: gaussian 
##    Adaptive bandwidth: 387 (number of nearest neighbours)
##    Regression points: the same locations as observations are used.
##    Distance metric: Euclidean distance metric is used.
## 
##    ****************Summary of GWR coefficient estimates:******************
##                  Min.  1st Qu.   Median  3rd Qu.     Max.
##    Intercept -90492.7 -88417.6 -85855.7 -82925.8 -81703.5
##    hv206      33103.6  34079.4  38367.3  42248.7  45783.0
##    hv207      34832.4  35288.4  35513.6  36139.2  36830.4
##    hv208      75002.5  77148.8  79359.3  79868.9  80604.3
##    hv210      45836.3  46893.1  49596.8  53661.8  54412.5
##    hv212     154268.1 158197.6 165619.3 174422.8 181273.6
##    hv227     -42691.9 -41208.9 -36599.9 -32925.4 -29711.8
##    hv243a     95410.0  97891.9 100645.9 105908.9 107147.8
##    hv243d    -62528.1 -61565.8 -61222.3 -60670.4 -58660.5
##    hv246c     37837.3  40219.5  43696.4  46476.7  50121.3
##    hv246d    -12768.4 -12186.7 -10699.3  -9601.4  -8840.6
##    hv246f     -2841.2  -2780.1  -2668.3  -2568.6  -2469.7
##    hv247      72763.5  73248.8  75549.1  77002.0  77868.4
##    ************************Diagnostic information*************************
##    Number of data points: 553 
##    Effective number of parameters (2trace(S) - trace(S'S)): 19.60113 
##    Effective degrees of freedom (n-2trace(S) + trace(S'S)): 533.3989 
##    AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 12538.8 
##    AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 12518.97 
##    BIC (GWR book, Fotheringham, et al. 2002,GWR p. 61, eq. 2.34): 12054.22 
##    Residual sum of squares: 213252104866 
##    R-square value:  0.9511415 
##    Adjusted R-square value:  0.9493427 
## 
##    ***********************************************************************
##    Program stops at: 2024-03-03 20:42:10
# Map GWR Coefficients Across the Country

# electricity
df$gwr_hv206 <- lm.gwr$SDF$hv206
ggplot(data=df) +
    geom_sf(aes(fill=gwr_hv206)) +
    scale_fill_gradient2() +
    ggtitle("GWR Electricity Coefficients")

# this means that electricity is a better predictor of wealth the farther we get from Freetown

# radio
df$gwr_hv207 <- lm.gwr$SDF$hv207
ggplot(data=df) +
    geom_sf(aes(fill=gwr_hv207)) +
    scale_fill_gradient2() +
    ggtitle("GWR Radio Coefficients")

# tv
df$gwr_hv208 <- lm.gwr$SDF$hv208
ggplot(data=df) +
    geom_sf(aes(fill=gwr_hv208)) +
    scale_fill_gradient2() +
    ggtitle("GWR TV Coefficients")

# mobile phone
df$gwr_hv243a <- lm.gwr$SDF$hv243a
ggplot(data=df) +
    geom_sf(aes(fill=gwr_hv243a)) +
    scale_fill_gradient2() +
    ggtitle("GWR Mobile Phone Coefficients")

# horses / donkeys / mules
df$gwr_hv246c <- lm.gwr$SDF$hv246c
ggplot(data=df) +
    geom_sf(aes(fill=gwr_hv246c)) +
    scale_fill_gradient2() +
    ggtitle("GWR Horse/Donkey/Mule Coefficients")

# goats
df$gwr_hv246d <- lm.gwr$SDF$hv246d
ggplot(data=df) +
    geom_sf(aes(fill=gwr_hv246d)) +
    scale_fill_gradient2() +
    ggtitle("GWR Goats Coefficients")

# woah, owning goats means you're poor! But more so in the more urban, western region

# chickens
df$gwr_hv246f <- lm.gwr$SDF$hv246f
ggplot(data=df) +
    geom_sf(aes(fill=gwr_hv246f)) +
    scale_fill_gradient2() +
    ggtitle("GWR Chicken Coefficients")

# woah, very negatively correlated with wealth! everywhere.

# bank account
df$gwr_hv247 <- lm.gwr$SDF$hv247
ggplot(data=df) +
    geom_sf(aes(fill=gwr_hv247)) +
    scale_fill_gradient2() +
    ggtitle("GWR Bank Account Coefficients")

Decision Trees

# read in data
sl19clean <- readRDS("./exports/sl19clean.rds")
df <- sl19clean

# filter down to columns of interest
dTreeCols <- c("hv010","hv011","hv012","hv014","hv025","hv040","hv045c","hv201","hv204","hv205","hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv213","hv214","hv215","hv216","hv220","hv221","hv226","hv227","hv230a","hv237","hv241","hv243a","hv243b","hv243c","hv243d","hv243e","hv244","hv246","hv246a","hv246b","hv246c","hv246d","hv246e","hv246f","hv247","hv271")
df <- sl19clean[,dTreeCols]

# clean data
df <- na.omit(df)
dTreeFactors <- c("hv025","hv045c","hv201","hv205","hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv213","hv214","hv215","hv221","hv226","hv227","hv230a","hv237","hv241","hv243a","hv243b","hv243c","hv243d","hv243e","hv244","hv246","hv247")
df <- df %>% mutate_at(dTreeFactors, as.factor)
str(df)
## 'data.frame':    12768 obs. of  43 variables:
##  $ hv010 : num  0 0 1 1 1 2 0 0 1 1 ...
##  $ hv011 : num  1 0 0 0 0 0 1 0 0 0 ...
##  $ hv012 : num  1 5 1 5 22 8 5 2 8 10 ...
##  $ hv014 : num  0 0 0 0 1 1 2 1 2 1 ...
##  $ hv025 : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hv040 : num  46 46 46 46 46 46 46 46 46 46 ...
##  $ hv045c: Factor w/ 6 levels "1","2","3","4",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ hv201 : Factor w/ 15 levels "11","12","13",..: 4 5 10 4 10 10 10 10 5 5 ...
##  $ hv204 : dbl+lbl [1:12768]  20,  25,  60,  20,  30,  40,  40,  20,  30,  35,  2...
##    ..@ label       : chr "time to get to water source (minutes)"
##    ..@ format.stata: chr "%8.0g"
##    ..@ labels      : Named num  996 998
##    .. ..- attr(*, "names")= chr [1:2] "on premises" "don't know"
##  $ hv205 : Factor w/ 12 levels "11","12","13",..: 9 8 8 7 9 9 7 8 8 8 ...
##  $ hv206 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv207 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
##  $ hv208 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv209 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv210 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv211 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv212 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv213 : Factor w/ 10 levels "11","12","21",..: 1 1 8 1 1 8 1 1 1 8 ...
##  $ hv214 : Factor w/ 16 levels "11","12","13",..: 4 4 4 3 4 10 4 4 4 10 ...
##  $ hv215 : Factor w/ 14 levels "11","12","13",..: 5 8 2 5 8 8 5 2 8 8 ...
##  $ hv216 : num  3 2 1 4 4 4 2 1 3 4 ...
##  $ hv220 : dbl+lbl [1:12768] 48, 76, 35, 73, 58, 60, 60, 51, 41, 31, 50, 32, 51, ...
##    ..@ label       : chr "age of head of household"
##    ..@ format.stata: chr "%8.0g"
##    ..@ labels      : Named num  95 98
##    .. ..- attr(*, "names")= chr [1:2] "95+" "don't know"
##  $ hv221 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv226 : Factor w/ 10 levels "1","2","3","4",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ hv227 : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 2 2 2 1 ...
##  $ hv230a: Factor w/ 5 levels "1","2","3","4",..: 3 2 2 3 3 3 3 2 2 2 ...
##  $ hv237 : Factor w/ 3 levels "0","1","8": 2 2 1 2 1 1 1 1 2 2 ...
##  $ hv241 : Factor w/ 4 levels "1","2","3","6": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hv243a: Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 2 1 ...
##  $ hv243b: Factor w/ 2 levels "0","1": 2 1 1 1 2 1 1 1 1 1 ...
##  $ hv243c: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv243d: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv243e: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv244 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 1 ...
##  $ hv246 : Factor w/ 2 levels "0","1": 2 2 1 2 2 1 1 2 2 2 ...
##  $ hv246a: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
##    ..@ label       : chr "owns cattle"
##    ..@ format.stata: chr "%8.0g"
##    ..@ labels      : Named num  0 95 98
##    .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
##  $ hv246b: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
##    ..@ label       : chr "owns cows/ bulls"
##    ..@ format.stata: chr "%8.0g"
##    ..@ labels      : Named num  0 95 98
##    .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
##  $ hv246c: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
##    ..@ label       : chr "owns horses/ donkeys/ mules"
##    ..@ format.stata: chr "%8.0g"
##    ..@ labels      : Named num  0 95 98
##    .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
##  $ hv246d: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0...
##    ..@ label       : chr "owns goats"
##    ..@ format.stata: chr "%8.0g"
##    ..@ labels      : Named num  0 95 98
##    .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
##  $ hv246e: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
##    ..@ label       : chr "owns sheep"
##    ..@ format.stata: chr "%8.0g"
##    ..@ labels      : Named num  0 95 98
##    .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
##  $ hv246f: dbl+lbl [1:12768]  6,  1,  0, 20, 20,  0,  0, 10, 20,  5,  6,  0,  0, ...
##    ..@ label       : chr "owns chickens/poultry"
##    ..@ format.stata: chr "%8.0g"
##    ..@ labels      : Named num  0 95 98
##    .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
##  $ hv247 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hv271 : num  -102100 -89700 -88194 -94742 -88105 ...
##  - attr(*, "na.action")= 'omit' Named int [1:195] 99 102 107 489 560 568 620 645 815 1128 ...
##   ..- attr(*, "names")= chr [1:195] "99" "102" "107" "489" ...
# split into test and train data
f = 4/5
n = nrow(df)
set.seed(44)

# random sample without replacement
train <- sample(n, n*f, replace=FALSE)
traindf <- df[train,]
testdf <- df[-train,]

# Decision Tree with 5-Fold Cross Validation
cv <- trainControl(method = 'cv', number = 5)
dt1 <- train(hv271 ~ ., data=df, method="rpart", trControl = cv)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
print(dt1)
## CART 
## 
## 12768 samples
##    42 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10215, 10214, 10215, 10215, 10213 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   MAE     
##   0.07562718  48749.35  0.7549329  35522.23
##   0.09826903  55382.79  0.6859931  42028.31
##   0.63002492  75844.64  0.6247707  60019.59
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.07562718.
rpart.plot(dt1$finalModel, main="Decision Tree with 5-Fold Cross Validation")

pred1 <- predict(object = dt1, newdata = testdf)
rmse1 <- rmse(actual = testdf$hv271, predicted = pred1 )
rmse1
## [1] 53566.91
## DTree without Cross Validation (manually set cp=0.005) 
cp2 <- 0.005
dt2 <- rpart(hv271 ~ ., data = traindf, cp=cp2, method="anova")
summary(dt2)
## Call:
## rpart(formula = hv271 ~ ., data = traindf, method = "anova", 
##     cp = cp2)
##   n= 10214 
## 
##             CP nsplit  rel error     xerror        xstd
## 1  0.630948665      0 1.00000000 1.00033580 0.015470975
## 2  0.101162893      1 0.36905134 0.37026394 0.007821592
## 3  0.098306311      2 0.26788844 0.28018053 0.006777022
## 4  0.028664760      3 0.16958213 0.17067682 0.005052434
## 5  0.018954143      4 0.14091737 0.14136599 0.004017624
## 6  0.006914147      5 0.12196323 0.12296853 0.003589458
## 7  0.006705071      6 0.11504908 0.11708152 0.003517400
## 8  0.006671769      7 0.10834401 0.11346611 0.003507403
## 9  0.006143629      8 0.10167224 0.10535782 0.003433444
## 10 0.005000000      9 0.09552861 0.09785657 0.003361256
## 
## Variable importance
##  hv226  hv025  hv206  hv208  hv209  hv244  hv213  hv214  hv247 hv243b  hv205 
##     26     15     15     15      9      6      6      3      2      1      1 
## 
## Node number 1: 10214 observations,    complexity param=0.6309487
##   mean=-1603.993, MSE=9.621312e+09 
##   left son=2 (7264 obs) right son=3 (2950 obs)
##   Primary splits:
##       hv226 splits as  RRRRRRRLRR, improve=0.6309487, (0 missing)
##       hv206 splits as  LR,         improve=0.5905257, (0 missing)
##       hv208 splits as  LR,         improve=0.5769706, (0 missing)
##       hv025 splits as  RL,         improve=0.5629566, (0 missing)
##       hv213 splits as  LLRLRRRRRR, improve=0.5082371, (0 missing)
##   Surrogate splits:
##       hv025 splits as  RL, agree=0.873, adj=0.561, (0 split)
##       hv206 splits as  LR, agree=0.843, adj=0.457, (0 split)
##       hv208 splits as  LR, agree=0.824, adj=0.392, (0 split)
##       hv209 splits as  LR, agree=0.786, adj=0.258, (0 split)
##       hv244 splits as  RL, agree=0.778, adj=0.231, (0 split)
## 
## Node number 2: 7264 observations,    complexity param=0.1011629
##   mean=-51256.05, MSE=2.61084e+09 
##   left son=4 (4754 obs) right son=5 (2510 obs)
##   Primary splits:
##       hv213 splits as  LLRLRRRRRL, improve=0.5241979, (0 missing)
##       hv214 splits as  LLLLLLRRLRLLRL-R, improve=0.4770795, (0 missing)
##       hv208 splits as  LR, improve=0.2952548, (0 missing)
##       hv025 splits as  RL, improve=0.2831897, (0 missing)
##       hv206 splits as  LR, improve=0.2708360, (0 missing)
##   Surrogate splits:
##       hv214 splits as  LLLLLLRRLRRLRL-R, agree=0.870, adj=0.624, (0 split)
##       hv025 splits as  RL, agree=0.718, adj=0.185, (0 split)
##       hv205 splits as  RRRR-RRLLLLL, agree=0.692, adj=0.109, (0 split)
##       hv247 splits as  LR, agree=0.690, adj=0.102, (0 split)
##       hv206 splits as  LR, agree=0.680, adj=0.073, (0 split)
## 
## Node number 3: 2950 observations,    complexity param=0.09830631
##   mean=120657.9, MSE=5.865187e+09 
##   left son=6 (1557 obs) right son=7 (1393 obs)
##   Primary splits:
##       hv208 splits as  LR,         improve=0.5583515, (0 missing)
##       hv209 splits as  LR,         improve=0.5171638, (0 missing)
##       hv206 splits as  LR,         improve=0.4869690, (0 missing)
##       hv213 splits as  LRRLRRRLRL, improve=0.3516020, (0 missing)
##       hv247 splits as  LR,         improve=0.3473129, (0 missing)
##   Surrogate splits:
##       hv206  splits as  LR,         agree=0.865, adj=0.714, (0 split)
##       hv209  splits as  LR,         agree=0.782, adj=0.538, (0 split)
##       hv247  splits as  LR,         agree=0.701, adj=0.367, (0 split)
##       hv243b splits as  LR,         agree=0.679, adj=0.321, (0 split)
##       hv213  splits as  LRRLRRRLRR, agree=0.667, adj=0.294, (0 split)
## 
## Node number 4: 4754 observations,    complexity param=0.006705071
##   mean=-78137.03, MSE=6.42506e+08 
##   left son=8 (1041 obs) right son=9 (3713 obs)
##   Primary splits:
##       hv215  splits as  LLRRLRRRR-RR--, improve=0.2157234, (0 missing)
##       hv214  splits as  LLLLLL--LRLLRL-R, improve=0.1997467, (0 missing)
##       hv243a splits as  LR, improve=0.1941441, (0 missing)
##       hv025  splits as  RL, improve=0.1755999, (0 missing)
##       hv205  splits as  --R--RRLLLRL, improve=0.1482406, (0 missing)
##   Surrogate splits:
##       hv214  splits as  LLRRRL--RRRRRR-R, agree=0.786, adj=0.023, (0 split)
##       hv241  splits as  LRRR, agree=0.784, adj=0.014, (0 split)
##       hv246b < 3.5  to the right, agree=0.784, adj=0.014, (0 split)
##       hv243d splits as  RL, agree=0.783, adj=0.008, (0 split)
##       hv246d < 18   to the right, agree=0.782, adj=0.007, (0 split)
## 
## Node number 5: 2510 observations,    complexity param=0.02866476
##   mean=-342.8375, MSE=2.37816e+09 
##   left son=10 (2303 obs) right son=11 (207 obs)
##   Primary splits:
##       hv208 splits as  LR, improve=0.4719149, (0 missing)
##       hv206 splits as  LR, improve=0.4138054, (0 missing)
##       hv209 splits as  LR, improve=0.3189899, (0 missing)
##       hv247 splits as  LR, improve=0.3165087, (0 missing)
##       hv025 splits as  RL, improve=0.2241800, (0 missing)
##   Surrogate splits:
##       hv206  splits as  LR, agree=0.948, adj=0.367, (0 split)
##       hv209  splits as  LR, agree=0.940, adj=0.271, (0 split)
##       hv243e splits as  LR, agree=0.924, adj=0.072, (0 split)
##       hv221  splits as  LR, agree=0.918, adj=0.010, (0 split)
## 
## Node number 6: 1557 observations,    complexity param=0.006914147
##   mean=66529.39, MSE=1.897943e+09 
##   left son=12 (220 obs) right son=13 (1337 obs)
##   Primary splits:
##       hv213  splits as  L--L-RRRRR, improve=0.2299307, (0 missing)
##       hv214  splits as  LLLLLLR-RRRLRRRR, improve=0.2260315, (0 missing)
##       hv206  splits as  LR, improve=0.1899410, (0 missing)
##       hv205  splits as  LRRRLRLLLLLL, improve=0.1631874, (0 missing)
##       hv243b splits as  LR, improve=0.1601806, (0 missing)
##   Surrogate splits:
##       hv214 splits as  LRLLLRR-LRRLRRRR, agree=0.885, adj=0.186, (0 split)
##       hv215 splits as  RL-LLRRRR-RR--, agree=0.863, adj=0.027, (0 split)
##       hv226 splits as  LR-RRRR-LR, agree=0.861, adj=0.014, (0 split)
##       hv201 splits as  RRRRRRRRRLRRRRR, agree=0.859, adj=0.005, (0 split)
##       hv205 splits as  RRRRRRRRRLRR, agree=0.859, adj=0.005, (0 split)
## 
## Node number 7: 1393 observations,    complexity param=0.01895414
##   mean=181159, MSE=3.364277e+09 
##   left son=14 (1035 obs) right son=15 (358 obs)
##   Primary splits:
##       hv205  splits as  LRLL-LLLL-LL, improve=0.3974583, (0 missing)
##       hv213  splits as  LLL-RLRLRL,   improve=0.3700823, (0 missing)
##       hv209  splits as  LR,           improve=0.3503209, (0 missing)
##       hv243e splits as  LR,           improve=0.3433737, (0 missing)
##       hv212  splits as  LR,           improve=0.2853069, (0 missing)
##   Surrogate splits:
##       hv241 splits as  RLLL, agree=0.792, adj=0.190, (0 split)
##       hv212 splits as  LR, agree=0.781, adj=0.148, (0 split)
##       hv213 splits as  LLL-RLRLLL, agree=0.757, adj=0.053, (0 split)
##       hv201 splits as  RLLLLLLLRLLLLRL, agree=0.753, adj=0.039, (0 split)
##       hv221 splits as  LR, agree=0.749, adj=0.022, (0 split)
## 
## Node number 8: 1041 observations
##   mean=-100371.4, MSE=2.632583e+08 
## 
## Node number 9: 3713 observations
##   mean=-71903.27, MSE=5.713709e+08 
## 
## Node number 10: 2303 observations,    complexity param=0.006671769
##   mean=-10386.46, MSE=1.10947e+09 
##   left son=20 (1692 obs) right son=21 (611 obs)
##   Primary splits:
##       hv025  splits as  RL,           improve=0.2566029, (0 missing)
##       hv247  splits as  LR,           improve=0.2290053, (0 missing)
##       hv205  splits as  LRRR-RRLLLRL, improve=0.2029599, (0 missing)
##       hv243a splits as  LR,           improve=0.1788949, (0 missing)
##       hv207  splits as  LR,           improve=0.1434461, (0 missing)
##   Surrogate splits:
##       hv201 splits as  LRRLLLLLLLL---L, agree=0.744, adj=0.036, (0 split)
##       hv237 splits as  LLR, agree=0.743, adj=0.033, (0 split)
##       hv206 splits as  LR, agree=0.739, adj=0.018, (0 split)
##       hv214 splits as  -LLLL-RL-LRLR--R, agree=0.738, adj=0.013, (0 split)
##       hv040 < 1.5  to the right, agree=0.737, adj=0.008, (0 split)
## 
## Node number 11: 207 observations
##   mean=111398.6, MSE=2.884666e+09 
## 
## Node number 12: 220 observations
##   mean=15030.9, MSE=1.294514e+09 
## 
## Node number 13: 1337 observations
##   mean=75003.33, MSE=1.489033e+09 
## 
## Node number 14: 1035 observations,    complexity param=0.006143629
##   mean=159652.9, MSE=1.759999e+09 
##   left son=28 (528 obs) right son=29 (507 obs)
##   Primary splits:
##       hv209  splits as  LR, improve=0.3314381, (0 missing)
##       hv243e splits as  LR, improve=0.2542904, (0 missing)
##       hv247  splits as  LR, improve=0.2466412, (0 missing)
##       hv213  splits as  LLL--LRLRL, improve=0.2332752, (0 missing)
##       hv201  splits as  RLLLLLLLLLLRLRR, improve=0.1663839, (0 missing)
##   Surrogate splits:
##       hv247  splits as  LR,         agree=0.611, adj=0.205, (0 split)
##       hv216  < 1.5  to the left,    agree=0.603, adj=0.189, (0 split)
##       hv213  splits as  LRL--LRLRL, agree=0.598, adj=0.179, (0 split)
##       hv220  < 33.5 to the left,    agree=0.587, adj=0.158, (0 split)
##       hv243e splits as  LR,         agree=0.583, adj=0.148, (0 split)
## 
## Node number 15: 358 observations
##   mean=243334.7, MSE=2.799372e+09 
## 
## Node number 20: 1692 observations
##   mean=-20525.79, MSE=7.654139e+08 
## 
## Node number 21: 611 observations
##   mean=17691.67, MSE=9.891666e+08 
## 
## Node number 28: 528 observations
##   mean=135985.8, MSE=9.651528e+08 
## 
## Node number 29: 507 observations
##   mean=184300.2, MSE=1.396944e+09
rpart.plot(dt2, extra=1, main="Decision Tree with cp=0.005")

pred2 <- predict(object = dt2, newdata = testdf)
rmse2 <- rmse(actual = testdf$hv271, predicted = pred2 )
rmse2
## [1] 32329.84
dt2$variable.importance
##        hv226        hv025        hv206        hv208        hv209        hv244 
## 6.201391e+13 3.726203e+13 3.700438e+13 3.679614e+13 2.256233e+13 1.431361e+13 
##        hv213        hv214        hv247       hv243b        hv205        hv215 
## 1.367162e+13 6.352942e+12 4.677737e+12 3.100045e+12 2.947037e+12 6.774523e+11 
##        hv241       hv243e        hv212        hv216        hv201        hv220 
## 3.632965e+11 2.934382e+11 2.757574e+11 1.143190e+11 9.953771e+10 9.526583e+10 
##        hv221        hv237       hv246b        hv040       hv243d       hv246d 
## 6.884063e+10 2.146149e+10 9.494544e+09 5.365374e+09 5.063757e+09 4.430787e+09
## Deeper DTree without Cross Validation (manually set cp=0.001) for more depth
cp3 <- 0.001
dt3 <- rpart(hv271 ~ ., data = traindf, cp=cp3, method="anova")
summary(dt3)
## Call:
## rpart(formula = hv271 ~ ., data = traindf, method = "anova", 
##     cp = cp3)
##   n= 10214 
## 
##             CP nsplit  rel error     xerror        xstd
## 1  0.630948665      0 1.00000000 1.00020395 0.015472453
## 2  0.101162893      1 0.36905134 0.37028576 0.007822632
## 3  0.098306311      2 0.26788844 0.26118562 0.006546728
## 4  0.028664760      3 0.16958213 0.17071200 0.005054195
## 5  0.018954143      4 0.14091737 0.14141064 0.004017292
## 6  0.006914147      5 0.12196323 0.12287966 0.003578845
## 7  0.006705071      6 0.11504908 0.11693149 0.003510940
## 8  0.006671769      7 0.10834401 0.11292743 0.003462634
## 9  0.006143629      8 0.10167224 0.10482036 0.003426244
## 10 0.004692157      9 0.09552861 0.09858180 0.003357561
## 11 0.003978692     10 0.09083645 0.09293645 0.003264036
## 12 0.003960219     11 0.08685776 0.08999181 0.003242922
## 13 0.003386240     12 0.08289754 0.08588757 0.003059009
## 14 0.002788733     13 0.07951130 0.08339450 0.002975613
## 15 0.002487637     14 0.07672257 0.07936435 0.002938308
## 16 0.001944458     15 0.07423493 0.07717209 0.002907692
## 17 0.001789649     16 0.07229048 0.07531841 0.002896222
## 18 0.001760663     17 0.07050083 0.07313251 0.002805449
## 19 0.001722961     18 0.06874016 0.07263000 0.002797436
## 20 0.001448665     19 0.06701720 0.07115337 0.002734830
## 21 0.001435067     20 0.06556854 0.07043797 0.002728605
## 22 0.001407958     21 0.06413347 0.07032118 0.002727825
## 23 0.001254925     22 0.06272551 0.06801393 0.002538144
## 24 0.001094580     23 0.06147059 0.06609345 0.002542648
## 25 0.001086488     24 0.06037601 0.06479144 0.002510171
## 26 0.001053168     25 0.05928952 0.06465067 0.002501184
## 27 0.001009545     26 0.05823635 0.06343546 0.002488078
## 28 0.001000000     27 0.05722681 0.06273256 0.002382972
## 
## Variable importance
##  hv226  hv025  hv206  hv208  hv209  hv244  hv213  hv214  hv247 hv243b  hv205 
##     25     15     15     15      9      6      6      3      2      1      1 
## 
## Node number 1: 10214 observations,    complexity param=0.6309487
##   mean=-1603.993, MSE=9.621312e+09 
##   left son=2 (7264 obs) right son=3 (2950 obs)
##   Primary splits:
##       hv226 splits as  RRRRRRRLRR, improve=0.6309487, (0 missing)
##       hv206 splits as  LR,         improve=0.5905257, (0 missing)
##       hv208 splits as  LR,         improve=0.5769706, (0 missing)
##       hv025 splits as  RL,         improve=0.5629566, (0 missing)
##       hv213 splits as  LLRLRRRRRR, improve=0.5082371, (0 missing)
##   Surrogate splits:
##       hv025 splits as  RL, agree=0.873, adj=0.561, (0 split)
##       hv206 splits as  LR, agree=0.843, adj=0.457, (0 split)
##       hv208 splits as  LR, agree=0.824, adj=0.392, (0 split)
##       hv209 splits as  LR, agree=0.786, adj=0.258, (0 split)
##       hv244 splits as  RL, agree=0.778, adj=0.231, (0 split)
## 
## Node number 2: 7264 observations,    complexity param=0.1011629
##   mean=-51256.05, MSE=2.61084e+09 
##   left son=4 (4754 obs) right son=5 (2510 obs)
##   Primary splits:
##       hv213 splits as  LLRLRRRRRL, improve=0.5241979, (0 missing)
##       hv214 splits as  LLLLLLRRLRLLRL-R, improve=0.4770795, (0 missing)
##       hv208 splits as  LR, improve=0.2952548, (0 missing)
##       hv025 splits as  RL, improve=0.2831897, (0 missing)
##       hv206 splits as  LR, improve=0.2708360, (0 missing)
##   Surrogate splits:
##       hv214 splits as  LLLLLLRRLRRLRL-R, agree=0.870, adj=0.624, (0 split)
##       hv025 splits as  RL, agree=0.718, adj=0.185, (0 split)
##       hv205 splits as  RRRR-RRLLLLL, agree=0.692, adj=0.109, (0 split)
##       hv247 splits as  LR, agree=0.690, adj=0.102, (0 split)
##       hv206 splits as  LR, agree=0.680, adj=0.073, (0 split)
## 
## Node number 3: 2950 observations,    complexity param=0.09830631
##   mean=120657.9, MSE=5.865187e+09 
##   left son=6 (1557 obs) right son=7 (1393 obs)
##   Primary splits:
##       hv208 splits as  LR,         improve=0.5583515, (0 missing)
##       hv209 splits as  LR,         improve=0.5171638, (0 missing)
##       hv206 splits as  LR,         improve=0.4869690, (0 missing)
##       hv213 splits as  LRRLRRRLRL, improve=0.3516020, (0 missing)
##       hv247 splits as  LR,         improve=0.3473129, (0 missing)
##   Surrogate splits:
##       hv206  splits as  LR,         agree=0.865, adj=0.714, (0 split)
##       hv209  splits as  LR,         agree=0.782, adj=0.538, (0 split)
##       hv247  splits as  LR,         agree=0.701, adj=0.367, (0 split)
##       hv243b splits as  LR,         agree=0.679, adj=0.321, (0 split)
##       hv213  splits as  LRRLRRRLRR, agree=0.667, adj=0.294, (0 split)
## 
## Node number 4: 4754 observations,    complexity param=0.006705071
##   mean=-78137.03, MSE=6.42506e+08 
##   left son=8 (1041 obs) right son=9 (3713 obs)
##   Primary splits:
##       hv215  splits as  LLRRLRRRR-RR--, improve=0.2157234, (0 missing)
##       hv214  splits as  LLLLLL--LRLLRL-R, improve=0.1997467, (0 missing)
##       hv243a splits as  LR, improve=0.1941441, (0 missing)
##       hv025  splits as  RL, improve=0.1755999, (0 missing)
##       hv205  splits as  --R--RRLLLRL, improve=0.1482406, (0 missing)
##   Surrogate splits:
##       hv214  splits as  LLRRRL--RRRRRR-R, agree=0.786, adj=0.023, (0 split)
##       hv241  splits as  LRRR, agree=0.784, adj=0.014, (0 split)
##       hv246b < 3.5   to the right, agree=0.784, adj=0.014, (0 split)
##       hv243d splits as  RL, agree=0.783, adj=0.008, (0 split)
##       hv246d < 18    to the right, agree=0.782, adj=0.007, (0 split)
## 
## Node number 5: 2510 observations,    complexity param=0.02866476
##   mean=-342.8375, MSE=2.37816e+09 
##   left son=10 (2303 obs) right son=11 (207 obs)
##   Primary splits:
##       hv208 splits as  LR, improve=0.4719149, (0 missing)
##       hv206 splits as  LR, improve=0.4138054, (0 missing)
##       hv209 splits as  LR, improve=0.3189899, (0 missing)
##       hv247 splits as  LR, improve=0.3165087, (0 missing)
##       hv025 splits as  RL, improve=0.2241800, (0 missing)
##   Surrogate splits:
##       hv206  splits as  LR, agree=0.948, adj=0.367, (0 split)
##       hv209  splits as  LR, agree=0.940, adj=0.271, (0 split)
##       hv243e splits as  LR, agree=0.924, adj=0.072, (0 split)
##       hv221  splits as  LR, agree=0.918, adj=0.010, (0 split)
## 
## Node number 6: 1557 observations,    complexity param=0.006914147
##   mean=66529.39, MSE=1.897943e+09 
##   left son=12 (220 obs) right son=13 (1337 obs)
##   Primary splits:
##       hv213  splits as  L--L-RRRRR, improve=0.2299307, (0 missing)
##       hv214  splits as  LLLLLLR-RRRLRRRR, improve=0.2260315, (0 missing)
##       hv206  splits as  LR, improve=0.1899410, (0 missing)
##       hv205  splits as  LRRRLRLLLLLL, improve=0.1631874, (0 missing)
##       hv243b splits as  LR, improve=0.1601806, (0 missing)
##   Surrogate splits:
##       hv214 splits as  LRLLLRR-LRRLRRRR, agree=0.885, adj=0.186, (0 split)
##       hv215 splits as  RL-LLRRRR-RR--, agree=0.863, adj=0.027, (0 split)
##       hv226 splits as  LR-RRRR-LR, agree=0.861, adj=0.014, (0 split)
##       hv201 splits as  RRRRRRRRRLRRRRR, agree=0.859, adj=0.005, (0 split)
##       hv205 splits as  RRRRRRRRRLRR, agree=0.859, adj=0.005, (0 split)
## 
## Node number 7: 1393 observations,    complexity param=0.01895414
##   mean=181159, MSE=3.364277e+09 
##   left son=14 (1035 obs) right son=15 (358 obs)
##   Primary splits:
##       hv205  splits as  LRLL-LLLL-LL, improve=0.3974583, (0 missing)
##       hv213  splits as  LLL-RLRLRL,   improve=0.3700823, (0 missing)
##       hv209  splits as  LR,           improve=0.3503209, (0 missing)
##       hv243e splits as  LR,           improve=0.3433737, (0 missing)
##       hv212  splits as  LR,           improve=0.2853069, (0 missing)
##   Surrogate splits:
##       hv241 splits as  RLLL, agree=0.792, adj=0.190, (0 split)
##       hv212 splits as  LR, agree=0.781, adj=0.148, (0 split)
##       hv213 splits as  LLL-RLRLLL, agree=0.757, adj=0.053, (0 split)
##       hv201 splits as  RLLLLLLLRLLLLRL, agree=0.753, adj=0.039, (0 split)
##       hv221 splits as  LR, agree=0.749, adj=0.022, (0 split)
## 
## Node number 8: 1041 observations
##   mean=-100371.4, MSE=2.632583e+08 
## 
## Node number 9: 3713 observations,    complexity param=0.004692157
##   mean=-71903.27, MSE=5.713709e+08 
##   left son=18 (1713 obs) right son=19 (2000 obs)
##   Primary splits:
##       hv243a splits as  LR, improve=0.2173500, (0 missing)
##       hv025  splits as  RL, improve=0.1960208, (0 missing)
##       hv214  splits as  LLLLLL--LRLLRL-R, improve=0.1929872, (0 missing)
##       hv205  splits as  --R--RRLLLRL, improve=0.1490087, (0 missing)
##       hv243b splits as  LR, improve=0.1375063, (0 missing)
##   Surrogate splits:
##       hv207  splits as  LR,        agree=0.643, adj=0.227, (0 split)
##       hv243b splits as  LR,        agree=0.611, adj=0.157, (0 split)
##       hv246  splits as  LR,        agree=0.590, adj=0.110, (0 split)
##       hv246f < 0.5   to the left,  agree=0.584, adj=0.098, (0 split)
##       hv012  < 4.5   to the left,  agree=0.584, adj=0.097, (0 split)
## 
## Node number 10: 2303 observations,    complexity param=0.006671769
##   mean=-10386.46, MSE=1.10947e+09 
##   left son=20 (1692 obs) right son=21 (611 obs)
##   Primary splits:
##       hv025  splits as  RL,           improve=0.2566029, (0 missing)
##       hv247  splits as  LR,           improve=0.2290053, (0 missing)
##       hv205  splits as  LRRR-RRLLLRL, improve=0.2029599, (0 missing)
##       hv243a splits as  LR,           improve=0.1788949, (0 missing)
##       hv207  splits as  LR,           improve=0.1434461, (0 missing)
##   Surrogate splits:
##       hv201 splits as  LRRLLLLLLLL---L, agree=0.744, adj=0.036, (0 split)
##       hv237 splits as  LLR, agree=0.743, adj=0.033, (0 split)
##       hv206 splits as  LR, agree=0.739, adj=0.018, (0 split)
##       hv214 splits as  -LLLL-RL-LRLR--R, agree=0.738, adj=0.013, (0 split)
##       hv040 < 1.5   to the right, agree=0.737, adj=0.008, (0 split)
## 
## Node number 11: 207 observations,    complexity param=0.001789649
##   mean=111398.6, MSE=2.884666e+09 
##   left son=22 (133 obs) right son=23 (74 obs)
##   Primary splits:
##       hv209  splits as  LR,         improve=0.2945318, (0 missing)
##       hv213  splits as  ----LRRLL-, improve=0.2208200, (0 missing)
##       hv243e splits as  LR,         improve=0.2032407, (0 missing)
##       hv206  splits as  LR,         improve=0.1743461, (0 missing)
##       hv212  splits as  LR,         improve=0.1658422, (0 missing)
##   Surrogate splits:
##       hv243e splits as  LR, agree=0.671, adj=0.081, (0 split)
##       hv011  < 3.5   to the left,  agree=0.667, adj=0.068, (0 split)
##       hv210  splits as  LR, agree=0.662, adj=0.054, (0 split)
##       hv012  < 9.5   to the left,  agree=0.657, adj=0.041, (0 split)
##       hv201  splits as  RRRLLLLLLLL---L, agree=0.657, adj=0.041, (0 split)
## 
## Node number 12: 220 observations,    complexity param=0.001009545
##   mean=15030.9, MSE=1.294514e+09 
##   left son=24 (63 obs) right son=25 (157 obs)
##   Primary splits:
##       hv025 splits as  RL, improve=0.3483584, (0 missing)
##       hv214 splits as  LLLLL---RR-LR--R, improve=0.2306552, (0 missing)
##       hv206 splits as  LR, improve=0.2293939, (0 missing)
##       hv205 splits as  RRR--RRLLLRL, improve=0.1857657, (0 missing)
##       hv201 splits as  --RRLRLRLLR-R-R, improve=0.1815635, (0 missing)
##   Surrogate splits:
##       hv205 splits as  RRR--RRRLRRR, agree=0.759, adj=0.159, (0 split)
##       hv214 splits as  RLRLR---RR-RR--R, agree=0.755, adj=0.143, (0 split)
##       hv215 splits as  -L-RL--RL-RR--, agree=0.745, adj=0.111, (0 split)
##       hv201 splits as  --RRRRRRLRR-R-R, agree=0.727, adj=0.048, (0 split)
##       hv226 splits as  RL---RR-L-, agree=0.727, adj=0.048, (0 split)
## 
## Node number 13: 1337 observations,    complexity param=0.003978692
##   mean=75003.33, MSE=1.489033e+09 
##   left son=26 (1070 obs) right son=27 (267 obs)
##   Primary splits:
##       hv206  splits as  LR,           improve=0.1963970, (0 missing)
##       hv213  splits as  -----RRLLL,   improve=0.1894837, (0 missing)
##       hv247  splits as  LR,           improve=0.1882796, (0 missing)
##       hv205  splits as  LRRRLLLLLLLL, improve=0.1742404, (0 missing)
##       hv243b splits as  LR,           improve=0.1682900, (0 missing)
##   Surrogate splits:
##       hv209 splits as  LR, agree=0.818, adj=0.086, (0 split)
##       hv040 < -5    to the right, agree=0.802, adj=0.007, (0 split)
##       hv205 splits as  LLLRLLLLLLLL, agree=0.802, adj=0.007, (0 split)
##       hv215 splits as  RL-R-LLLL-LL--, agree=0.802, adj=0.007, (0 split)
##       hv226 splits as  -L-RLLL--L, agree=0.801, adj=0.004, (0 split)
## 
## Node number 14: 1035 observations,    complexity param=0.006143629
##   mean=159652.9, MSE=1.759999e+09 
##   left son=28 (528 obs) right son=29 (507 obs)
##   Primary splits:
##       hv209  splits as  LR, improve=0.3314381, (0 missing)
##       hv243e splits as  LR, improve=0.2542904, (0 missing)
##       hv247  splits as  LR, improve=0.2466412, (0 missing)
##       hv213  splits as  LLL--LRLRL, improve=0.2332752, (0 missing)
##       hv201  splits as  RLLLLLLLLLLRLRR, improve=0.1663839, (0 missing)
##   Surrogate splits:
##       hv247  splits as  LR,         agree=0.611, adj=0.205, (0 split)
##       hv216  < 1.5   to the left,   agree=0.603, adj=0.189, (0 split)
##       hv213  splits as  LRL--LRLRL, agree=0.598, adj=0.179, (0 split)
##       hv220  < 33.5  to the left,   agree=0.587, adj=0.158, (0 split)
##       hv243e splits as  LR,         agree=0.583, adj=0.148, (0 split)
## 
## Node number 15: 358 observations,    complexity param=0.003960219
##   mean=243334.7, MSE=2.799372e+09 
##   left son=30 (207 obs) right son=31 (151 obs)
##   Primary splits:
##       hv243e splits as  LR, improve=0.3883342, (0 missing)
##       hv212  splits as  LR, improve=0.3477234, (0 missing)
##       hv213  splits as  --L-RLRLR-, improve=0.2885903, (0 missing)
##       hv201  splits as  RLLLLLLLLLLR-RR, improve=0.2543288, (0 missing)
##       hv209  splits as  LR, improve=0.2383280, (0 missing)
##   Surrogate splits:
##       hv212 splits as  LR, agree=0.707, adj=0.305, (0 split)
##       hv226 splits as  RRRR-LL-R-, agree=0.628, adj=0.119, (0 split)
##       hv201 splits as  RLLLLLLLLRLL-RR, agree=0.626, adj=0.113, (0 split)
##       hv247 splits as  LR, agree=0.626, adj=0.113, (0 split)
##       hv241 splits as  RLL-, agree=0.623, adj=0.106, (0 split)
## 
## Node number 18: 1713 observations,    complexity param=0.00109458
##   mean=-83944.62, MSE=1.954337e+08 
##   left son=36 (1519 obs) right son=37 (194 obs)
##   Primary splits:
##       hv214 splits as  LLLLL----RLLRL--, improve=0.32130740, (0 missing)
##       hv205 splits as  --R--RRLLLRL, improve=0.21453740, (0 missing)
##       hv201 splits as  RRRRRRLRLLLR---, improve=0.17294140, (0 missing)
##       hv025 splits as  RL, improve=0.14849050, (0 missing)
##       hv207 splits as  LR, improve=0.07991174, (0 missing)
##   Surrogate splits:
##       hv201 splits as  LLLLLLLLLLLR---, agree=0.887, adj=0.005, (0 split)
##       hv205 splits as  --L--LLLLLRL, agree=0.887, adj=0.005, (0 split)
##       hv215 splits as  --LL-LLLL--R--, agree=0.887, adj=0.005, (0 split)
##       hv221 splits as  LR, agree=0.887, adj=0.005, (0 split)
## 
## Node number 19: 2000 observations,    complexity param=0.002788733
##   mean=-61589.85, MSE=6.62807e+08 
##   left son=38 (1796 obs) right son=39 (204 obs)
##   Primary splits:
##       hv025 splits as  RL, improve=0.2067378, (0 missing)
##       hv208 splits as  LR, improve=0.1728325, (0 missing)
##       hv214 splits as  LLLLLL--LRLLR--R, improve=0.1727517, (0 missing)
##       hv212 splits as  LR, improve=0.1700778, (0 missing)
##       hv209 splits as  LR, improve=0.1571962, (0 missing)
##   Surrogate splits:
##       hv237 splits as  LLR, agree=0.901, adj=0.029, (0 split)
##       hv206 splits as  LR, agree=0.900, adj=0.025, (0 split)
##       hv201 splits as  LRLLLLLLLLLL--L, agree=0.899, adj=0.010, (0 split)
##       hv214 splits as  LLLLLL--LLLLL--R, agree=0.899, adj=0.010, (0 split)
##       hv215 splits as  ---L-L-LR-LL--, agree=0.898, adj=0.005, (0 split)
## 
## Node number 20: 1692 observations,    complexity param=0.00338624
##   mean=-20525.79, MSE=7.654139e+08 
##   left son=40 (1556 obs) right son=41 (136 obs)
##   Primary splits:
##       hv247  splits as  LR,           improve=0.2569515, (0 missing)
##       hv243a splits as  LR,           improve=0.1608756, (0 missing)
##       hv207  splits as  LR,           improve=0.1564948, (0 missing)
##       hv205  splits as  RRR--RRLLLRL, improve=0.1490329, (0 missing)
##       hv211  splits as  LR,           improve=0.1421848, (0 missing)
##   Surrogate splits:
##       hv201  splits as  LLLLLLLLLLL---R, agree=0.921, adj=0.022, (0 split)
##       hv215  splits as  LLLLLLRLLLLL--, agree=0.921, adj=0.022, (0 split)
##       hv243e splits as  LR, agree=0.921, adj=0.022, (0 split)
##       hv241  splits as  RLLL, agree=0.921, adj=0.015, (0 split)
##       hv213  splits as  --L-LRLLL-, agree=0.920, adj=0.007, (0 split)
## 
## Node number 21: 611 observations,    complexity param=0.001435067
##   mean=17691.67, MSE=9.891666e+08 
##   left son=42 (341 obs) right son=43 (270 obs)
##   Primary splits:
##       hv243b splits as  LR, improve=0.2333413, (0 missing)
##       hv247  splits as  LR, improve=0.1808688, (0 missing)
##       hv213  splits as  ----L-RLL-, improve=0.1764094, (0 missing)
##       hv214  splits as  -LLLL-R--RLLR--R, improve=0.1588980, (0 missing)
##       hv243a splits as  LR, improve=0.1573900, (0 missing)
##   Surrogate splits:
##       hv216 < 3.5   to the left,  agree=0.637, adj=0.178, (0 split)
##       hv247 splits as  LR,        agree=0.630, adj=0.163, (0 split)
##       hv012 < 7.5   to the left,  agree=0.627, adj=0.156, (0 split)
##       hv211 splits as  LR,        agree=0.615, adj=0.130, (0 split)
##       hv207 splits as  LR,        agree=0.601, adj=0.096, (0 split)
## 
## Node number 22: 133 observations
##   mean=89656.36, MSE=1.651197e+09 
## 
## Node number 23: 74 observations
##   mean=150475.8, MSE=2.724921e+09 
## 
## Node number 24: 63 observations
##   mean=-18492.35, MSE=4.807779e+08 
## 
## Node number 25: 157 observations
##   mean=28482.9, MSE=9.891338e+08 
## 
## Node number 26: 1070 observations,    complexity param=0.002487637
##   mean=66460.87, MSE=1.06226e+09 
##   left son=52 (958 obs) right son=53 (112 obs)
##   Primary splits:
##       hv213  splits as  -----RRLLL, improve=0.2150813, (0 missing)
##       hv247  splits as  LR, improve=0.1890525, (0 missing)
##       hv243b splits as  LR, improve=0.1839440, (0 missing)
##       hv205  splits as  LRRLLRLLLLRL, improve=0.1777065, (0 missing)
##       hv201  splits as  RLLLLLLLLLLRLRR, improve=0.1400835, (0 missing)
##   Surrogate splits:
##       hv205  splits as  LRLLLLLLLLLL, agree=0.898, adj=0.027, (0 split)
##       hv012  < 20.5  to the left,     agree=0.897, adj=0.018, (0 split)
##       hv246a < 7.5   to the left,     agree=0.897, adj=0.018, (0 split)
## 
## Node number 27: 267 observations,    complexity param=0.001448665
##   mean=109237.2, MSE=1.734924e+09 
##   left son=54 (245 obs) right son=55 (22 obs)
##   Primary splits:
##       hv205  splits as  LRLR-LLLL-LL, improve=0.3073310, (0 missing)
##       hv243e splits as  LR,           improve=0.2753632, (0 missing)
##       hv213  splits as  -----LRL--,   improve=0.2551368, (0 missing)
##       hv247  splits as  LR,           improve=0.2178186, (0 missing)
##       hv209  splits as  LR,           improve=0.2067656, (0 missing)
##   Surrogate splits:
##       hv215  splits as  L--L--LLL--R--, agree=0.925, adj=0.091, (0 split)
##       hv201  splits as  RLLLLLLLLLL-L-L, agree=0.921, adj=0.045, (0 split)
##       hv246f < 13.5  to the left,  agree=0.921, adj=0.045, (0 split)
## 
## Node number 28: 528 observations,    complexity param=0.001053168
##   mean=135985.8, MSE=9.651528e+08 
##   left son=56 (328 obs) right son=57 (200 obs)
##   Primary splits:
##       hv247  splits as  LR, improve=0.2030944, (0 missing)
##       hv213  splits as  L-L--RRLRR, improve=0.1879628, (0 missing)
##       hv201  splits as  -RLLLLLL-LL-L-R, improve=0.1866501, (0 missing)
##       hv243e splits as  LR, improve=0.1753470, (0 missing)
##       hv243b splits as  LR, improve=0.1635960, (0 missing)
##   Surrogate splits:
##       hv243e splits as  LR,         agree=0.655, adj=0.090, (0 split)
##       hv213  splits as  L-R--LRLLR, agree=0.642, adj=0.055, (0 split)
##       hv040  < 381.5 to the left,   agree=0.638, adj=0.045, (0 split)
##       hv216  < 3.5   to the left,   agree=0.638, adj=0.045, (0 split)
##       hv212  splits as  LR,         agree=0.636, adj=0.040, (0 split)
## 
## Node number 29: 507 observations,    complexity param=0.001944458
##   mean=184300.2, MSE=1.396944e+09 
##   left son=58 (390 obs) right son=59 (117 obs)
##   Primary splits:
##       hv243e splits as  LR, improve=0.2697998, (0 missing)
##       hv213  splits as  LLR--LRLRL, improve=0.2383709, (0 missing)
##       hv247  splits as  LR, improve=0.2279579, (0 missing)
##       hv201  splits as  RLLLLLLLLLLR-RR, improve=0.1857227, (0 missing)
##       hv212  splits as  LR, improve=0.1404558, (0 missing)
##   Surrogate splits:
##       hv214 splits as  -LR-L-RLRLLRLLRL, agree=0.781, adj=0.051, (0 split)
##       hv226 splits as  RRRL-LL---, agree=0.777, adj=0.034, (0 split)
##       hv215 splits as  ---R-RLL-LRL--, agree=0.775, adj=0.026, (0 split)
##       hv221 splits as  LR, agree=0.773, adj=0.017, (0 split)
##       hv213 splits as  LLR--LLLLL, agree=0.771, adj=0.009, (0 split)
## 
## Node number 30: 207 observations,    complexity param=0.001086488
##   mean=215174.4, MSE=1.634418e+09 
##   left son=60 (85 obs) right son=61 (122 obs)
##   Primary splits:
##       hv213  splits as  --L--LRLR-, improve=0.3155890, (0 missing)
##       hv209  splits as  LR, improve=0.2667738, (0 missing)
##       hv201  splits as  LRLLLLRLL-LR--R, improve=0.2016098, (0 missing)
##       hv230a splits as  RLLRL, improve=0.1705785, (0 missing)
##       hv025  splits as  LR, improve=0.1524198, (0 missing)
##   Surrogate splits:
##       hv220  < 31.5  to the left,  agree=0.633, adj=0.106, (0 split)
##       hv230a splits as  RLRRL, agree=0.633, adj=0.106, (0 split)
##       hv201  splits as  LRRRLRRRL-LL--R, agree=0.628, adj=0.094, (0 split)
##       hv214  splits as  ------R--R-LR-LL, agree=0.628, adj=0.094, (0 split)
##       hv045c splits as  -RRLRR, agree=0.618, adj=0.071, (0 split)
## 
## Node number 31: 151 observations
##   mean=281938.5, MSE=1.819019e+09 
## 
## Node number 36: 1519 observations
##   mean=-86776.55, MSE=1.160144e+08 
## 
## Node number 37: 194 observations
##   mean=-61770.92, MSE=2.628121e+08 
## 
## Node number 38: 1796 observations,    complexity param=0.001760663
##   mean=-65535.02, MSE=4.76771e+08 
##   left son=76 (1780 obs) right son=77 (16 obs)
##   Primary splits:
##       hv208  splits as  LR, improve=0.2020646, (0 missing)
##       hv243e splits as  LR, improve=0.1855175, (0 missing)
##       hv214  splits as  LLLLLL--LRLLR--R, improve=0.1474377, (0 missing)
##       hv211  splits as  LR, improve=0.1456801, (0 missing)
##       hv247  splits as  LR, improve=0.1346353, (0 missing)
## 
## Node number 39: 204 observations
##   mean=-26856.91, MSE=9.57249e+08 
## 
## Node number 40: 1556 observations,    complexity param=0.001407958
##   mean=-24671.88, MSE=4.827509e+08 
##   left son=80 (866 obs) right son=81 (690 obs)
##   Primary splits:
##       hv205  splits as  RRR--RRLLLRL, improve=0.1841990, (0 missing)
##       hv243a splits as  LR, improve=0.1818572, (0 missing)
##       hv207  splits as  LR, improve=0.1648626, (0 missing)
##       hv214  splits as  -LLLL--L-RLLR--R, improve=0.1607202, (0 missing)
##       hv211  splits as  LR, improve=0.1307780, (0 missing)
##   Surrogate splits:
##       hv230a splits as  RLLRL, agree=0.588, adj=0.071, (0 split)
##       hv201  splits as  RLRRRRLLLLL----, agree=0.585, adj=0.064, (0 split)
##       hv244  splits as  RL, agree=0.569, adj=0.028, (0 split)
##       hv011  < 2.5   to the left,  agree=0.566, adj=0.022, (0 split)
##       hv214  splits as  -LLLL--R-LLLR--R, agree=0.566, adj=0.022, (0 split)
## 
## Node number 41: 136 observations
##   mean=26910.32, MSE=1.552552e+09 
## 
## Node number 42: 341 observations
##   mean=4172.956, MSE=6.548029e+08 
## 
## Node number 43: 270 observations
##   mean=34765.3, MSE=8.891334e+08 
## 
## Node number 52: 958 observations,    complexity param=0.001722961
##   mean=61292.62, MSE=8.079664e+08 
##   left son=104 (540 obs) right son=105 (418 obs)
##   Primary splits:
##       hv243b splits as  LR, improve=0.2187494, (0 missing)
##       hv247  splits as  LR, improve=0.1816620, (0 missing)
##       hv201  splits as  RLLLLLLLLLLRLRR, improve=0.1357486, (0 missing)
##       hv207  splits as  LR, improve=0.1268753, (0 missing)
##       hv205  splits as  RRRRRRRLLLRR, improve=0.1246600, (0 missing)
##   Surrogate splits:
##       hv247 splits as  LR,           agree=0.641, adj=0.177, (0 split)
##       hv211 splits as  LR,           agree=0.603, adj=0.091, (0 split)
##       hv040 < 87.5  to the left,     agree=0.597, adj=0.077, (0 split)
##       hv207 splits as  LR,           agree=0.591, adj=0.062, (0 split)
##       hv205 splits as  RRLLLRLLLLRL, agree=0.588, adj=0.055, (0 split)
## 
## Node number 53: 112 observations
##   mean=110667.8, MSE=1.05465e+09 
## 
## Node number 54: 245 observations
##   mean=102317.7, MSE=9.073984e+08 
## 
## Node number 55: 22 observations
##   mean=186294.7, MSE=4.479495e+09 
## 
## Node number 56: 328 observations
##   mean=125053.2, MSE=7.338539e+08 
## 
## Node number 57: 200 observations
##   mean=153915.3, MSE=8.269976e+08 
## 
## Node number 58: 390 observations
##   mean=173666.8, MSE=8.879907e+08 
## 
## Node number 59: 117 observations
##   mean=219744.8, MSE=1.460243e+09 
## 
## Node number 60: 85 observations
##   mean=187965.4, MSE=8.945052e+08 
## 
## Node number 61: 122 observations
##   mean=234131.5, MSE=1.274754e+09 
## 
## Node number 76: 1780 observations,    complexity param=0.001254925
##   mean=-66465.59, MSE=2.992565e+08 
##   left son=152 (1487 obs) right son=153 (293 obs)
##   Primary splits:
##       hv214  splits as  LLLLLL--LRLLR--R, improve=0.2315178, (0 missing)
##       hv205  splits as  --R--RRLLLLL, improve=0.1510356, (0 missing)
##       hv211  splits as  LR, improve=0.1405112, (0 missing)
##       hv243b splits as  LR, improve=0.1279039, (0 missing)
##       hv201  splits as  RRRRRRRRRLLL--R, improve=0.1240041, (0 missing)
##   Surrogate splits:
##       hv045c splits as  RLLLLL, agree=0.836, adj=0.003, (0 split)
##       hv201  splits as  LLLLLLLLLLLL--R, agree=0.836, adj=0.003, (0 split)
##       hv205  splits as  --R--LLLLLLL, agree=0.836, adj=0.003, (0 split)
##       hv209  splits as  LR, agree=0.836, adj=0.003, (0 split)
##       hv221  splits as  LR, agree=0.836, adj=0.003, (0 split)
## 
## Node number 77: 16 observations
##   mean=37991.12, MSE=9.411255e+09 
## 
## Node number 80: 866 observations
##   mean=-33089.14, MSE=2.802928e+08 
## 
## Node number 81: 690 observations
##   mean=-14107.6, MSE=5.363245e+08 
## 
## Node number 104: 540 observations
##   mean=49595.98, MSE=6.197975e+08 
## 
## Node number 105: 418 observations
##   mean=76403.12, MSE=6.459861e+08 
## 
## Node number 152: 1487 observations
##   mean=-70160.4, MSE=2.236666e+08 
## 
## Node number 153: 293 observations
##   mean=-47714.11, MSE=2.619799e+08
rpart.plot(dt3, extra=1, main="Decision Tree with cp=0.001")

pred3 <- predict(object=dt3, newdata = testdf)
rmse3 <- rmse(actual=testdf$hv271, predicted = pred3 )
rmse3
## [1] 25401.74
dt3$variable.importance
##        hv226        hv025        hv206        hv208        hv209        hv244 
## 6.207302e+13 3.763530e+13 3.740209e+13 3.696917e+13 2.277231e+13 1.431742e+13 
##        hv213        hv214        hv247       hv243b        hv205       hv243e 
## 1.403263e+13 6.623549e+12 5.210779e+12 3.482801e+12 3.263280e+12 9.046184e+11 
##        hv215       hv243a        hv241        hv212        hv201        hv216 
## 7.184846e+11 4.611081e+11 4.094277e+11 3.984551e+11 1.915529e+11 1.440478e+11 
##        hv207        hv220        hv012        hv221       hv246f        hv246 
## 1.285546e+11 1.065710e+11 7.838628e+10 7.308242e+10 5.169357e+10 5.087532e+10 
##        hv211        hv237        hv040       hv230a        hv011        hv210 
## 3.367391e+10 2.952192e+10 2.591376e+10 2.113099e+10 1.489117e+10 9.506623e+09 
##       hv246b       hv045c       hv243d       hv246d       hv246a 
## 9.494544e+09 7.957712e+09 5.063757e+09 4.430787e+09 4.365451e+09
df2 <- data.frame(imp = dt2$variable.importance)
df3 <- df2 %>% 
  tibble::rownames_to_column() %>% 
  dplyr::rename("variable" = rowname) %>% 
  dplyr::arrange(imp) %>%
  dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(df3) +
  geom_col(aes(x = variable, y = imp),
           col = "black", fill="darkgreen", show.legend = F) +
  coord_flip() +
  scale_fill_grey() +
  theme_bw()